Code Example #1
File: ssd.py    Project: skyhookml/skyhookml
            def __init__(self, info):
                super(SSD, self).__init__()
                self.infer = info['infer']
                detection_metadata = info['metadatas'][1]
                if detection_metadata and 'Categories' in detection_metadata:
                    self.categories = detection_metadata['Categories']
                else:
                    self.categories = ['object']
                self.num_classes = len(self.categories) + 1
                lib.eprint('ssd: set num_classes={}'.format(self.num_classes))

                self.mode = info['params'].get('mode', 'mb2-ssd-lite')
                mb2_width_mult = info['params'].get('mb2_width_mult', 1.0)

                # adapt from train_ssd.py
                if self.mode == 'vgg16-ssd':
                    create_net = create_vgg_ssd
                    config = vgg_ssd_config
                elif self.mode == 'mb1-ssd':
                    create_net = create_mobilenetv1_ssd
                    config = mobilenetv1_ssd_config
                elif self.mode == 'mb1-ssd-lite':
                    create_net = create_mobilenetv1_ssd_lite
                    config = mobilenetv1_ssd_config
                elif self.mode == 'sq-ssd-lite':
                    create_net = create_squeezenet_ssd_lite
                    config = squeezenet_ssd_config
                elif self.mode == 'mb2-ssd-lite':
                    create_net = lambda num, is_test: create_mobilenetv2_ssd_lite(
                        num, width_mult=mb2_width_mult, is_test=is_test)
                    config = mobilenetv1_ssd_config
                elif self.mode == 'mb3-large-ssd-lite':
                    create_net = lambda num, is_test: create_mobilenetv3_large_ssd_lite(
                        num, is_test=is_test)
                    config = mobilenetv1_ssd_config
                elif self.mode == 'mb3-small-ssd-lite':
                    create_net = lambda num, is_test: create_mobilenetv3_small_ssd_lite(
                        num, is_test=is_test)
                    config = mobilenetv1_ssd_config

                config.iou_threshold = info['params'].get(
                    'iou_threshold', config.iou_threshold)
                self.prob_threshold = info['params'].get(
                    'confidence_threshold', 0.01)
                self.config = config

                self.model = create_net(self.num_classes, is_test=self.infer)
                self.criterion = MultiboxLoss(config.priors,
                                              iou_threshold=0.5,
                                              neg_pos_ratio=3,
                                              center_variance=0.1,
                                              size_variance=0.2,
                                              device=info['device'])
                self.match_prior = MatchPrior(config.priors,
                                              config.center_variance,
                                              config.size_variance, 0.5)
                self.image_mean = torch.tensor(self.config.image_mean,
                                               dtype=torch.float32).reshape(
                                                   1, 3, 1,
                                                   1).to(info['device'])
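
The constructor above resolves create_net and config from the mode string and keeps the loss, prior matcher and image mean on the instance. A minimal driving sketch follows; the contents of the info dict are hypothetical example values, not taken from the skyhookml project.

info = {
    'infer': False,
    'metadatas': [None, {'Categories': ['person', 'car']}],
    'params': {'mode': 'mb2-ssd-lite', 'mb2_width_mult': 1.0,
               'confidence_threshold': 0.3},
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
model = SSD(info)   # num_classes becomes len(categories) + 1 = 3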
Code Example #2
def res_test(dataset, net, device):
    config = mobilenetv1_ssd_config
    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=3,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=device)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean,
                                   config.image_std)
    val_dataset = SKUDataset(dataset,
                             transform=test_transform,
                             target_transform=target_transform,
                             mode='1')
    loader = DataLoader(val_dataset,
                        args.batch_size,
                        num_workers=args.num_workers,
                        shuffle=False)

    net.eval()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    num = 0
    for i, data in enumerate(loader):

        images, boxes, labels = data
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        num += 1

        with torch.no_grad():
            confidence, locations = net(images)
            regression_loss, classification_loss = criterion(
                confidence, locations, labels, boxes)
            loss = regression_loss + classification_loss

        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
        if i % 50 == 0:
            logger.info(f"Step: {i} in Test - loss : {loss}. ")

    return running_loss / num, running_regression_loss / num, running_classification_loss / num
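
Note that num counts batches, so the three returned values are per-batch averages rather than per-sample ones. In the pytorch-ssd convention, the criterion call above expects confidence of shape (N, num_priors, num_classes), locations and boxes of shape (N, num_priors, 4), and labels of shape (N, num_priors). A hedged usage sketch (the dataset path is a placeholder; SKUDataset, net, args and logger are assumed to exist as in the surrounding file):

avg_loss, avg_reg, avg_cls = res_test('/path/to/sku/dataset', net, torch.device('cuda'))
logger.info(f"val loss {avg_loss:.4f} (reg {avg_reg:.4f}, cls {avg_cls:.4f})")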
Code Example #3
File: train_ssd.py    Project: alfarih31/grit-fr-ml
    timer.start("Load Model")
    if args.resume:
        logging.info(f"Resume from the model {args.resume}")
        net.load_state_dict(torch.load(args.resume, map_location=lambda storage, loc: storage))
    elif args.base_net:
        logging.info(f"Init from base net {args.base_net}")
        net.init_from_base_net(args.base_net)
    elif args.pretrained_ssd:
        logging.info(f"Init from pretrained ssd {args.pretrained_ssd}")
        net.init_from_pretrained_ssd(args.pretrained_ssd)
    logging.info(f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

    net.to(DEVICE)

    criterion = MultiboxLoss(config.priors, iou_threshold=0.45, neg_pos_ratio=3,
                             center_variance=0.1, size_variance=0.2, device=DEVICE)
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    logging.info(f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, "
                 + f"Extra Layers learning rate: {extra_layers_lr}.")

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, args.t_max)
    elif args.scheduler == 'step':
        logging.info("Uses Step scheduler.")
Code Example #4
def optim_and_model_initial(args, net, timer, config, DEVICE):
    #net = create_net(num_classes)
    last_epoch = -1

    base_net_lr = (args['Training_hyperparam']['base_net_lr']
                   if args['Training_hyperparam']['base_net_lr'] != "None"
                   else args['Training_hyperparam']['lr'])
    extra_layers_lr = (args['Training_hyperparam']['extra_layers_lr']
                       if args['Training_hyperparam']['extra_layers_lr'] != "None"
                       else args['Training_hyperparam']['lr'])
    if args['flow_control']['freeze_base_net']:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        params = itertools.chain(net.source_layer_add_ons.parameters(),
                                 net.extras.parameters(),
                                 net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        params = [{
            'params':
            itertools.chain(net.source_layer_add_ons.parameters(),
                            net.extras.parameters()),
            'lr':
            extra_layers_lr
        }, {
            'params':
            itertools.chain(net.regression_headers.parameters(),
                            net.classification_headers.parameters())
        }]
    elif args['flow_control']['freeze_net']:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [{
            'params': net.base_net.parameters(),
            'lr': base_net_lr
        }, {
            'params':
            itertools.chain(net.source_layer_add_ons.parameters(),
                            net.extras.parameters()),
            'lr':
            extra_layers_lr
        }, {
            'params':
            itertools.chain(net.regression_headers.parameters(),
                            net.classification_headers.parameters())
        }]

    timer.start("Load Model")

    if args['flow_control']['resume']:
        logging.info("Resume from the model {}".format(
            args['flow_control']['resume']))
        net.load(args['flow_control']['resume'])
    elif args['flow_control']['base_net']:
        logging.info("Init from base net {}".format(
            args['flow_control']['base_net']))
        net.init_from_base_net(args['flow_control']['base_net'])
    elif args['flow_control']['pretrained_ssd']:
        logging.info("Init from pretrained ssd {}".format(
            args['flow_control']['pretrained_ssd']))
        net.init_from_pretrained_ssd(args['flow_control']['pretrained_ssd'])
    logging.info('Took {:.2f} seconds to load the model.'.format(
        timer.end("Load Model")))

    # net.to(DEVICE)
    net = nn.DataParallel(net).cuda()
    neg_pos_ratio = 3  #3

    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=neg_pos_ratio,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    # criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=1,
    #                          center_variance=0.1, size_variance=0.2, device=DEVICE)
    optimizer = torch.optim.SGD(
        params,
        lr=args['Training_hyperparam']['lr'],
        momentum=args['Training_hyperparam']['momentum'],
        weight_decay=args['Training_hyperparam']['weighted_decay'])
    logging.info("Learning rate: {}, Base net learning rate: {}, ".format(
        args['Training_hyperparam']['lr'], base_net_lr) +
                 "Extra Layers learning rate: {}.".format(extra_layers_lr))

    if args['Training_hyperparam']['lr_scheduler'] == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [
            int(v.strip()) for v in args["Training_hyperparam"]
            ["lr_scheduler_param"]["multi-step"]['milestones'].split(",")
        ]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=args["Training_hyperparam"]
                                ["lr_scheduler_param"]["multi-step"]['gamma'],
                                last_epoch=last_epoch)
    elif args['Training_hyperparam']['lr_scheduler'] == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(
            optimizer,
            float(args['Training_hyperparam']['lr_scheduler_param']['cosine']
                  ['t_max']),
            last_epoch=last_epoch)
    else:
        logging.fatal("Unsupported Scheduler: {}.".format(
            args['Training_hyperparam']['lr_scheduler']))
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Start training from epoch {}.".format(last_epoch + 1))

    return net, criterion, optimizer, scheduler
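
The params list built above relies on PyTorch per-parameter-group learning rates: a group with an explicit 'lr' key uses it, and a group without one falls back to the optimizer-level rate. A small self-contained illustration (the modules and values are arbitrary):

import torch

base = torch.nn.Linear(8, 8)     # stands in for the base net
head = torch.nn.Linear(8, 4)     # stands in for the prediction heads
opt = torch.optim.SGD(
    [{'params': base.parameters(), 'lr': 1e-4},   # explicit group lr
     {'params': head.parameters()}],              # inherits the default lr
    lr=1e-2, momentum=0.9)
print([g['lr'] for g in opt.param_groups])        # [0.0001, 0.01]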
Code Example #5
def main(args):
    DEVICE = torch.device(
        "cuda:0" if torch.cuda.is_available() and args.use_cuda else "cpu")
    #DEVICE = torch.device("cpu")
    if args.use_cuda and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logging.info("Use Cuda.")

    timer = Timer()

    logging.info(args)
    if args.net == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif args.net == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif args.net == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif args.net == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif args.net == 'mb2-ssd-lite':
        create_net = lambda num: create_mobilenetv2_ssd_lite(
            num, width_mult=args.mb2_width_mult)
        config = mobilenetv1_ssd_config
    else:
        logging.fatal("The net type is wrong.")
        parser.print_help(sys.stderr)
        sys.exit(1)
    train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                        config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)

    test_transform = TestTransform(config.image_size, config.image_mean,
                                   config.image_std)

    logging.info("Prepare training datasets.")
    datasets = []
    for dataset_path in args.datasets:
        if args.dataset_type == 'voc':
            dataset = VOCDataset(dataset_path,
                                 transform=train_transform,
                                 target_transform=target_transform)
            label_file = os.path.join(args.checkpoint_folder,
                                      "voc-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'open_images':
            dataset = OpenImagesDataset(dataset_path,
                                        transform=train_transform,
                                        target_transform=target_transform,
                                        dataset_type="train",
                                        balance_data=args.balance_data)
            label_file = os.path.join(args.checkpoint_folder,
                                      "open-images-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'coco':
            # root, annFile, transform=None, target_transform=None, transforms=None)
            #  dataset_type="train", balance_data=args.balance_data)
            dataset = CocoDetection(
                "/home/wenyen4desh/datasets/coco/train2017",
                "/home/wenyen4desh/datasets/coco/annotations/instances_train2017.json",
                transform=train_transform,
                target_transform=target_transform)

            label_file = os.path.join(args.checkpoint_folder,
                                      "open-images-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)
            # raise ValueError("Dataset type {} yet implement.".format(args.dataset_type))
        else:
            raise ValueError("Dataset type {} is not supported.".format(
                args.dataset_type))
        datasets.append(dataset)
    logging.info("Stored labels into file {}.".format(label_file))
    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True)
    logging.info("Prepare Validation datasets.")
    if args.dataset_type == "voc":
        val_dataset = VOCDataset(args.validation_dataset,
                                 transform=test_transform,
                                 target_transform=target_transform,
                                 is_test=True)
    elif args.dataset_type == 'open_images':
        val_dataset = OpenImagesDataset(dataset_path,
                                        transform=test_transform,
                                        target_transform=target_transform,
                                        dataset_type="test")
        logging.info(val_dataset)
    elif args.dataset_type == "coco":
        val_dataset = CocoDetection(
            "/home/wenyen4desh/datasets/coco/val2017",
            "/home/wenyen4desh/datasets/coco/annotations/instances_val2017.json",
            transform=test_transform,
            target_transform=target_transform)
        logging.info(val_dataset)
    logging.info("validation dataset size: {}".format(len(val_dataset)))

    val_loader = DataLoader(val_dataset,
                            args.batch_size,
                            num_workers=args.num_workers,
                            shuffle=False)
    logging.info("Build network.")
    net = create_net(num_classes)
    min_loss = -10000.0
    last_epoch = -1

    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    extra_layers_lr = args.extra_layers_lr if args.extra_layers_lr is not None else args.lr
    if args.freeze_base_net:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        params = itertools.chain(net.source_layer_add_ons.parameters(),
                                 net.extras.parameters(),
                                 net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        params = [{
            'params':
            itertools.chain(net.source_layer_add_ons.parameters(),
                            net.extras.parameters()),
            'lr':
            extra_layers_lr
        }, {
            'params':
            itertools.chain(net.regression_headers.parameters(),
                            net.classification_headers.parameters())
        }]
    elif args.freeze_net:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [{
            'params': net.base_net.parameters(),
            'lr': base_net_lr
        }, {
            'params':
            itertools.chain(net.source_layer_add_ons.parameters(),
                            net.extras.parameters()),
            'lr':
            extra_layers_lr
        }, {
            'params':
            itertools.chain(net.regression_headers.parameters(),
                            net.classification_headers.parameters())
        }]

    timer.start("Load Model")
    if args.resume:
        logging.info("Resume from the model {}".format(args.resume))
        net.load(args.resume)
    elif args.base_net:
        logging.info("Init from base net {}".format(args.base_net))
        net.init_from_base_net(args.base_net)
    elif args.pretrained_ssd:
        logging.info("Init from pretrained ssd {}".format(args.pretrained_ssd))
        net.init_from_pretrained_ssd(args.pretrained_ssd)
    logging.info('Took {:.2f} seconds to load the model.'.format(
        timer.end("Load Model")))

    net.to(DEVICE)

    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=3,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    logging.info("Learning rate: {}, Base net learning rate: {}, ".format(
        args.lr, base_net_lr) +
                 "Extra Layers learning rate: {}.".format(extra_layers_lr))

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=0.1,
                                last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer,
                                      args.t_max,
                                      last_epoch=last_epoch)
    else:
        logging.fatal("Unsupported Scheduler: {}.".format(args.scheduler))
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Start training from epoch {}.".format(last_epoch + 1))
    for epoch in range(last_epoch + 1, args.num_epochs):
        scheduler.step()
        train(train_loader,
              net,
              criterion,
              optimizer,
              device=DEVICE,
              debug_steps=args.debug_steps,
              epoch=epoch)

        if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE)
            logging.info("Epoch: {}, ".format(epoch) +
                         "Validation Loss: {:.4f}, ".format(val_loss) +
                         "Validation Regression Loss {:.4f}, ".format(
                             val_regression_loss) +
                         "Validation Classification Loss: {:.4f}".format(
                             val_classification_loss))
            model_path = os.path.join(
                args.checkpoint_folder,
                "{}-Epoch-{}-Loss-{}.pth".format(args.net, epoch, val_loss))
            net.save(model_path)
            logging.info("Saved model {}".format(model_path))
Code Example #6
    timer.start("Load Model")
    if args.resume:
        logging.info(f"Resume from the model {args.resume}")
        net.load(args.resume)
    elif args.base_net:
        logging.info(f"Init from base net {args.base_net}")
        net.init_from_base_net(args.base_net)
    elif args.pretrained_ssd:
        logging.info(f"Init from pretrained ssd {args.pretrained_ssd}")
        net.init_from_pretrained_ssd(args.pretrained_ssd)
    logging.info(
        f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

    criterion = MultiboxLoss(config.priors,
                             neg_pos_ratio=3,
                             center_variance=0.1,
                             size_variance=0.2)

    if args.optimizer_type != "Adam":
        if args.scheduler == 'multi-step':
            logging.info("Uses MultiStepLR scheduler.")
            milestones = [int(v.strip()) for v in args.milestones.split(",")]
            scheduler = MultiStepDecay(args.lr,
                                       milestones=milestones,
                                       gamma=0.1,
                                       last_epoch=last_epoch)
        elif args.scheduler == 'cosine':
            logging.info("Uses CosineAnnealingLR scheduler.")
            scheduler = CosineAnnealingDecay(args.lr,
                                             args.t_max,
                                             last_epoch=last_epoch)
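
Unlike the torch.optim.lr_scheduler classes used elsewhere on this page, MultiStepDecay and CosineAnnealingDecay here take the base learning rate directly instead of an optimizer, which matches PaddlePaddle's paddle.optimizer.lr API. Assuming that is the library in use, wiring the scheduler to an optimizer might look like the sketch below.

# Assumption: scheduler is a paddle.optimizer.lr.LRScheduler built as above.
import paddle
optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                 parameters=net.parameters(),
                                 weight_decay=args.weight_decay)
# one scheduler step per epoch, after the training pass
scheduler.step()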
Code Example #7
    def setup_and_start_training(self):
        logging.basicConfig(
            stream=sys.stdout,
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and self.
                              system_dict["params"]["use_cuda"] else "cpu")

        if self.system_dict["params"]["use_cuda"] and torch.cuda.is_available(
        ):
            torch.backends.cudnn.benchmark = True
            logging.info("Using gpu.")
        else:
            logging.info("Using cpu.")

        timer = Timer()
        logging.info(self.system_dict)

        if self.system_dict["params"]["net"] == 'vgg16-ssd':
            create_net = create_vgg_ssd
            config = vgg_ssd_config
        elif self.system_dict["params"]["net"] == 'mb1-ssd':
            create_net = create_mobilenetv1_ssd
            config = mobilenetv1_ssd_config
        elif self.system_dict["params"]["net"] == 'mb1-ssd-lite':
            create_net = create_mobilenetv1_ssd_lite
            config = mobilenetv1_ssd_config
        elif self.system_dict["params"]["net"] == 'sq-ssd-lite':
            create_net = create_squeezenet_ssd_lite
            config = squeezenet_ssd_config
        elif self.system_dict["params"]["net"] == 'mb2-ssd-lite':
            create_net = lambda num: create_mobilenetv2_ssd_lite(
                num, width_mult=self.system_dict["params"]["mb2_width_mult"])
            config = mobilenetv1_ssd_config
        else:
            logging.fatal("The net type is wrong.")
            sys.exit(1)

        train_transform = TrainAugmentation(config.image_size,
                                            config.image_mean,
                                            config.image_std)
        target_transform = MatchPrior(config.priors, config.center_variance,
                                      config.size_variance, 0.5)

        test_transform = TestTransform(config.image_size, config.image_mean,
                                       config.image_std)

        logging.info("Prepare training datasets.")
        datasets = []
        dataset = VOCDataset(
            self.system_dict["dataset"]["val"]["img_dir"],
            self.system_dict["dataset"]["val"]["label_dir"],
            transform=train_transform,
            target_transform=target_transform,
            label_file=self.system_dict["params"]["label_file"])
        label_file = self.system_dict["params"]["label_file"]
        #store_labels(label_file, dataset.class_names)
        num_classes = len(dataset.class_names)
        datasets.append(dataset)
        logging.info(f"Stored labels into file {label_file}.")
        train_dataset = ConcatDataset(datasets)
        logging.info("Train dataset size: {}".format(len(train_dataset)))
        train_loader = DataLoader(
            train_dataset,
            self.system_dict["params"]["batch_size"],
            num_workers=self.system_dict["params"]["num_workers"],
            shuffle=True)

        if (self.system_dict["dataset"]["val"]["status"]):
            val_dataset = VOCDataset(
                self.system_dict["dataset"]["val"]["img_dir"],
                self.system_dict["dataset"]["val"]["label_dir"],
                transform=test_transform,
                target_transform=target_transform,
                is_test=True,
                label_file=self.system_dict["params"]["label_file"])
            logging.info("validation dataset size: {}".format(
                len(val_dataset)))
            val_loader = DataLoader(
                val_dataset,
                self.system_dict["params"]["batch_size"],
                num_workers=self.system_dict["params"]["num_workers"],
                shuffle=False)

        logging.info("Build network.")
        net = create_net(num_classes)
        min_loss = -10000.0
        last_epoch = -1

        base_net_lr = (self.system_dict["params"]["base_net_lr"]
                       if self.system_dict["params"]["base_net_lr"] is not None
                       else self.system_dict["params"]["lr"])
        extra_layers_lr = (self.system_dict["params"]["extra_layers_lr"]
                           if self.system_dict["params"]["extra_layers_lr"] is not None
                           else self.system_dict["params"]["lr"])

        if self.system_dict["params"]["freeze_base_net"]:
            logging.info("Freeze base net.")
            freeze_net_layers(net.base_net)
            params = itertools.chain(net.source_layer_add_ons.parameters(),
                                     net.extras.parameters(),
                                     net.regression_headers.parameters(),
                                     net.classification_headers.parameters())
            params = [{
                'params':
                itertools.chain(net.source_layer_add_ons.parameters(),
                                net.extras.parameters()),
                'lr':
                extra_layers_lr
            }, {
                'params':
                itertools.chain(net.regression_headers.parameters(),
                                net.classification_headers.parameters())
            }]
        elif self.system_dict["params"]["freeze_net"]:
            freeze_net_layers(net.base_net)
            freeze_net_layers(net.source_layer_add_ons)
            freeze_net_layers(net.extras)
            params = itertools.chain(net.regression_headers.parameters(),
                                     net.classification_headers.parameters())
            logging.info("Freeze all the layers except prediction heads.")
        else:
            params = [{
                'params': net.base_net.parameters(),
                'lr': base_net_lr
            }, {
                'params':
                itertools.chain(net.source_layer_add_ons.parameters(),
                                net.extras.parameters()),
                'lr':
                extra_layers_lr
            }, {
                'params':
                itertools.chain(net.regression_headers.parameters(),
                                net.classification_headers.parameters())
            }]

        timer.start("Load Model")
        resume = self.system_dict["params"]["resume"]
        base_net = self.system_dict["params"]["base_net"]
        pretrained_ssd = self.system_dict["params"]["pretrained_ssd"]
        if self.system_dict["params"]["resume"]:
            logging.info(f"Resume from the model {resume}")
            net.load(self.system_dict["params"]["resume"])
        elif self.system_dict["params"]["base_net"]:
            logging.info(f"Init from base net {base_net}")
            net.init_from_base_net(self.system_dict["params"]["base_net"])
        elif self.system_dict["params"]["pretrained_ssd"]:
            logging.info(f"Init from pretrained ssd {pretrained_ssd}")
            net.init_from_pretrained_ssd(
                self.system_dict["params"]["pretrained_ssd"])
        logging.info(
            f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

        net.to(DEVICE)

        criterion = MultiboxLoss(config.priors,
                                 iou_threshold=0.5,
                                 neg_pos_ratio=3,
                                 center_variance=0.1,
                                 size_variance=0.2,
                                 device=DEVICE)
        optimizer = torch.optim.SGD(
            params,
            lr=self.system_dict["params"]["lr"],
            momentum=self.system_dict["params"]["momentum"],
            weight_decay=self.system_dict["params"]["weight_decay"])
        lr = self.system_dict["params"]["lr"]
        logging.info(
            f"Learning rate: {lr}, Base net learning rate: {base_net_lr}, " +
            f"Extra Layers learning rate: {extra_layers_lr}.")

        if (not self.system_dict["params"]["milestones"]):
            self.system_dict["params"]["milestones"] = ""
            self.system_dict["params"]["milestones"] += str(
                int(self.system_dict["params"]["num_epochs"] / 3)) + ","
            self.system_dict["params"]["milestones"] += str(
                int(2 * self.system_dict["params"]["num_epochs"] / 3))

        if self.system_dict["params"]["scheduler"] == 'multi-step':
            logging.info("Uses MultiStepLR scheduler.")
            milestones = [
                int(v.strip())
                for v in self.system_dict["params"]["milestones"].split(",")
            ]
            scheduler = MultiStepLR(optimizer,
                                    milestones=milestones,
                                    gamma=0.1,
                                    last_epoch=last_epoch)
        elif self.system_dict["params"]["scheduler"] == 'cosine':
            logging.info("Uses CosineAnnealingLR scheduler.")
            scheduler = CosineAnnealingLR(optimizer,
                                          self.system_dict["params"]["t_max"],
                                          last_epoch=last_epoch)

        logging.info(f"Start training from epoch {last_epoch + 1}.")
        for epoch in range(last_epoch + 1,
                           self.system_dict["params"]["num_epochs"]):
            scheduler.step()
            self.base_train(
                train_loader,
                net,
                criterion,
                optimizer,
                device=DEVICE,
                debug_steps=self.system_dict["params"]["debug_steps"],
                epoch=epoch)

            if ((self.system_dict["dataset"]["val"]["status"]) and
                (epoch % self.system_dict["params"]["validation_epochs"] == 0
                 or epoch == self.system_dict["params"]["num_epochs"] - 1)):
                val_loss, val_regression_loss, val_classification_loss = self.base_test(
                    val_loader, net, criterion, DEVICE)
                logging.info(
                    f"Epoch: {epoch}, " +
                    f"Validation Loss: {val_loss:.4f}, " +
                    f"Validation Regression Loss {val_regression_loss:.4f}, " +
                    f"Validation Classification Loss: {val_classification_loss:.4f}"
                )
                net_name = self.system_dict["params"]["net"]
                model_path = os.path.join(
                    self.system_dict["params"]["checkpoint_folder"],
                    f"{net_name}-Epoch-{epoch}-Loss-{val_loss}.pth")
                net.save(model_path)
                logging.info(f"Saved model {model_path}")
            if (not self.system_dict["dataset"]["val"]["status"]):
                # net_name is only set in the validation branch above, so
                # define it here as well before building the checkpoint name
                net_name = self.system_dict["params"]["net"]
                model_path = os.path.join(
                    self.system_dict["params"]["checkpoint_folder"],
                    f"{net_name}-Epoch-{epoch}.pth")
                net.save(model_path)
                logging.info(f"Saved model {model_path}")
Code Example #8
def main():
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir,
        args.verbose)

    # Log various details about the execution environment.  This information is
    # sometimes useful when referring back to past experiment executions.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file
                      ]),  # remove both None and empty strings
        msglogger.logdir,
        gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(
            args.seed)  # For experiment reproducibility
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = distiller.apputils.classification_dataset_str_from_arch(
        args.arch)
    args.num_classes = distiller.apputils.classification_num_classes(
        args.dataset)

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model, config = create_model(args.pretrained,
                                 args.dataset,
                                 args.arch,
                                 parallel=not args.load_serialized,
                                 device_ids=args.gpus)

    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning(
            'The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.'
        )
        if not args.reset_optimizer:
            msglogger.warning(
                'If you wish to also reset the optimizer, call with: --reset-optimizer'
            )
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model,
                                              args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )

    # Define loss function (criterion)
    if "ssd" in args.arch:
        neg_pos_ratio = 3
        criterion = MultiboxLoss(config.priors,
                                 iou_threshold=0.5,
                                 neg_pos_ratio=neg_pos_ratio,
                                 center_variance=0.1,
                                 size_variance=0.2,
                                 device=args.device,
                                 reduction="sum",
                                 class_reduction=True,
                                 verbose=0)
    else:
        criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        if "ssd" in args.arch:
            base_net_lr = args.lr
            extra_layers_lr = args.lr
            params = [{
                'params': model.base_net.parameters(),
                'lr': base_net_lr
            }, {
                'params':
                itertools.chain(model.source_layer_add_ons.parameters(),
                                model.extras.parameters()),
                'lr':
                extra_layers_lr
            }, {
                'params':
                itertools.chain(model.regression_headers.parameters(),
                                model.classification_headers.parameters())
            }]
        else:
            params = model.parameters()
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return

    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(model,
                                                       os.path.join(
                                                           msglogger.logdir,
                                                           args.export_onnx),
                                                       args.dataset,
                                                       add_softmax=True,
                                                       verbose=False)

    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)

    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = load_data(args, config=config)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        #zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resumed_checkpoint_path.replace(
                                         ".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher, _ = create_model(args.kd_pretrained,
                                  args.dataset,
                                  args.kd_teacher,
                                  parallel=not args.load_serialized,
                                  device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        raw_teacher_model_path = msglogger.logdir + "/raw_teacher.pth.tar"
        if not os.path.exists(raw_teacher_model_path):
            teacher.save(raw_teacher_model_path)
            msglogger.info(Fore.CYAN + '\tRaw Teacher Model saved: {0}'.format(
                raw_teacher_model_path) + Style.RESET_ALL)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model,
            teacher,
            args.kd_temp,
            dlw,
            loss_type=args.kd_loss_type,
            focal_alpha=args.kd_focal_alpha,
            use_adaptive=args.kd_focal_adaptive,
            verbose=0)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'
            .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(
                epoch, metrics=(vloss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5,
                                       epoch, args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {
            'current_top1': top1,
            'best_top1': perf_scores_history[0].top1,
            'best_epoch': perf_scores_history[0].epoch
        }
        try:
            raw_fullpath_best = apputils.save_checkpoint(
                epoch,
                args.arch,
                model,
                optimizer=optimizer,
                scheduler=compression_scheduler,
                extras=checkpoint_extras,
                is_best=is_best,
                name=args.name,
                dir=msglogger.logdir)
        except Exception as ex:
            # keep previous fullpath_best
            pass
        mlflow.log_artifacts(msglogger.logdir)

    # Finally run results on the test set
    eval_params = {
        "model_type": args.arch,
        "model_path": raw_fullpath_best,
        "dataset_path": args.data,
        "label_path": "models/voc-model-labels.txt"
    }
    mlflow.projects.run(uri=".",
                        entry_point="eval",
                        use_conda=False,
                        parameters=eval_params)
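
The criterion selection earlier in this function keys off the architecture name: any args.arch containing "ssd" gets the multibox detection loss, everything else falls back to cross-entropy. A tiny illustration (the architecture names are examples only):

for arch in ("mobilenet_v2_ssd_lite", "resnet20_cifar"):
    kind = "MultiboxLoss" if "ssd" in arch else "CrossEntropyLoss"
    print(arch, "->", kind)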
Code Example #9
def train_network(dataset_path, model_path, net_type):
    args.datasets = dataset_path
    args.validation_dataset = dataset_path
    args.checkpoint_folder = model_path
    args.log_dir = os.path.join(args.checkpoint_folder, 'log')
    args.net = net_type

    timer = Timer()

    logging.info(args)
    if args.net == 'slim':
        create_net = create_mb_tiny_fd
        config = fd_config
    elif args.net == 'RFB':
        create_net = create_Mb_Tiny_RFB_fd
        config = fd_config
    else:
        logging.fatal("The net type is wrong.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                        config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, args.overlap_threshold)

    test_transform = TestTransform(config.image_size, config.image_mean_test,
                                   config.image_std)

    if not os.path.exists(args.checkpoint_folder):
        os.makedirs(args.checkpoint_folder)
    logging.info("Prepare training datasets.")
    datasets = []

    # voc datasets
    dataset = VOCDataset(dataset_path,
                         transform=train_transform,
                         target_transform=target_transform)
    label_file = os.path.join(args.checkpoint_folder, "voc-model-labels.txt")
    store_labels(label_file, dataset.class_names)
    num_classes = len(dataset.class_names)
    print('num_classes: ', num_classes)

    logging.info(f"Stored labels into file {label_file}.")
    # train_dataset = ConcatDataset(datasets)
    train_dataset = dataset
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True)
    logging.info("Prepare Validation datasets.")
    val_dataset = VOCDataset(args.validation_dataset,
                             transform=test_transform,
                             target_transform=target_transform,
                             is_test=True)

    logging.info("validation dataset size: {}".format(len(val_dataset)))

    val_loader = DataLoader(val_dataset,
                            args.batch_size,
                            num_workers=args.num_workers,
                            shuffle=False)
    logging.info("Build network.")
    net = create_net(num_classes)

    timer.start("Load Model")
    if args.resume:
        logging.info(f"Resume from the model {args.resume}")
        net.load(args.resume)
    logging.info(
        f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

    # add multigpu_train
    if torch.cuda.device_count() >= 1:
        cuda_index_list = [int(v.strip()) for v in args.cuda_index.split(",")]
        net = nn.DataParallel(net, device_ids=cuda_index_list)
        logging.info("use gpu :{}".format(cuda_index_list))

    min_loss = -10000.0
    last_epoch = -1

    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    extra_layers_lr = args.extra_layers_lr if args.extra_layers_lr is not None else args.lr
    params = [{
        'params': net.module.base_net.parameters(),
        'lr': base_net_lr
    }, {
        'params':
        itertools.chain(net.module.source_layer_add_ons.parameters(),
                        net.module.extras.parameters()),
        'lr':
        extra_layers_lr
    }, {
        'params':
        itertools.chain(net.module.regression_headers.parameters(),
                        net.module.classification_headers.parameters())
    }]

    net.to(DEVICE)
    criterion = MultiboxLoss(config.priors,
                             iou_threshold=args.iou_threshold,
                             neg_pos_ratio=5,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE,
                             num_classes=num_classes,
                             loss_type=args.loss_type)
    if args.optimizer_type == "SGD":
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer_type == "Adam":
        optimizer = torch.optim.Adam(params, lr=args.lr)
        logging.info("use Adam optimizer")
    else:
        logging.fatal(f"Unsupported optimizer: {args.scheduler}.")
        parser.print_help(sys.stderr)
        sys.exit(1)
    logging.info(
        f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " +
        f"Extra Layers learning rate: {extra_layers_lr}.")
    if args.optimizer_type != "Adam":
        if args.scheduler == 'multi-step':
            logging.info("Uses MultiStepLR scheduler.")
            milestones = [int(v.strip()) for v in args.milestones.split(",")]
            scheduler = MultiStepLR(optimizer,
                                    milestones=milestones,
                                    gamma=0.1,
                                    last_epoch=last_epoch)
        elif args.scheduler == 'poly':
            logging.info("Uses PolyLR scheduler.")
        else:
            logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
            parser.print_help(sys.stderr)
            sys.exit(1)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        if args.optimizer_type != "Adam":
            if args.scheduler != "poly":
                if epoch != 0:
                    scheduler.step()
        train(train_loader,
              net,
              criterion,
              optimizer,
              device=DEVICE,
              debug_steps=args.debug_steps,
              epoch=epoch)
        if args.scheduler == "poly":
            adjust_learning_rate(optimizer, epoch)
        logging.info("epoch: {} lr rate :{}".format(
            epoch, optimizer.param_groups[0]['lr']))

        if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
            logging.info("validation epoch: {} lr rate :{}".format(
                epoch, optimizer.param_groups[0]['lr']))
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE)
            logging.info(
                f"Epoch: {epoch}, " + f"Validation Loss: {val_loss:.4f}, " +
                f"Validation Regression Loss {val_regression_loss:.4f}, " +
                f"Validation Classification Loss: {val_classification_loss:.4f}"
            )
            model_path = os.path.join(
                args.checkpoint_folder,
                f"{args.net}-Epoch-{epoch}-Loss-{val_loss:.4f}.pth")
            net.module.save(model_path)
            logging.info(f"Saved model {model_path}")