Example #1
0
    def __init__(self, opt, logger=None):
        """Detection model wrapping a YOLOv4 detector.

        :param opt: parsed option namespace (device, batch_size, debug,
            num_classes, tag, ...)
        :param logger: optional logger (unused here, kept for interface parity)
        """
        # Fix: the original called super(...).__init__(config, kwargs) with
        # undefined names `config` and `kwargs`; use the no-arg base init as
        # the sibling Model classes in this file do.
        super(Model, self).__init__()
        self.opt = opt

        # YOLOv4 detector; `opt.num_classes` replaces the undefined
        # `config.DATA.NUM_CLASSESS` the original referenced.
        self.detector = yolov4(inference=True, n_classes=opt.num_classes)

        self.yolov4loss = Yolo_loss(device=opt.device, batch=opt.batch_size)

        if opt.debug:
            print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed training metrics and checkpoint directory.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join('checkpoints', opt.tag)
Example #2
0
    def load_config(self, config, hdfs_section):
        """Populate HDFS settings from a ConfigParser section.

        :param config: ConfigParser object
        :param hdfs_section: section describing the hdfs configuration
        """
        assert hdfs_section in config.sections()

        for option in config.options(hdfs_section):
            if option == "namenode_handlers":
                self.namenode_handlers = config.getint(hdfs_section, option)
            elif option == "replica":
                self.replica = config.getint(hdfs_section, option)
            elif option == "namenode_scheduler":
                namenode_sched_section = config.get(hdfs_section, option)
                if namenode_sched_section == "None":
                    self.namenode_scheduler_generator = None
                else:
                    # Bind the section name as a default argument so the
                    # closure does not depend on a later rebinding of the
                    # local variable.
                    self.namenode_scheduler_generator = \
                        lambda env, sec=namenode_sched_section: get_scheduler(
                            env, config, sec)
            elif option == "datanode":
                datanode_config_section = config.get(hdfs_section, option)
                self.datanode_conf.load_config(config, datanode_config_section)
            else:
                # Fix: Python-2-only `print` statement -> function call
                # (same output under Python 3).
                print("Warning: unknown option for hdfs:", option)
Example #3
0
    def __init__(self, opt, logger=None):
        """Yolo v2/v3 detection model built from a Darknet cfg file.

        :param opt: option namespace (model, device, weights,
            checkpoint_dir, tag)
        :param logger: logger forwarded to print_network
        :raises ValueError: if ``opt.model`` is not 'Yolo2' or 'Yolo3'
        """
        super(Model, self).__init__()
        self.opt = opt
        self.logger = logger

        # Pick the Darknet cfg file for the requested Yolo version.
        if opt.model == 'Yolo2':
            cfgfile = 'configs/yolo2-voc.cfg'
        elif opt.model == 'Yolo3':
            cfgfile = 'configs/yolo3-coco.cfg'
        else:
            # Fix: previously fell through with `cfgfile` unbound, raising a
            # confusing NameError below; fail fast with a clear message.
            raise ValueError(
                'unsupported model: %s (expected Yolo2 or Yolo3)' % opt.model)

        # Build the detector and move it to the target device.
        self.detector = Darknet(cfgfile, device=opt.device).to(opt.device)
        print_network(self.detector, logger=logger)

        # Optionally load Darknet-format weights (before any --load checkpoint).
        if opt.weights:
            utils.color_print('Load Yolo weights from %s.' % opt.weights, 3)
            self.detector.load_weights(opt.weights)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #4
0
    def __init__(self, opt):
        """Classification model: network, optimizer, scheduler and CE loss."""
        super(Model, self).__init__()
        self.opt = opt

        # Backbone classifier selected by name.
        self.classifier = Classifier(opt.model)
        print_network(self.classifier)

        # Optimization setup.
        self.optimizer = get_optimizer(opt, self.classifier)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metric tracking and checkpoint location.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)

        # Plain (unweighted) cross-entropy criterion.
        self.criterionCE = nn.CrossEntropyLoss()
Example #5
0
    def __init__(self, opt, logger=None):
        """Faster R-CNN detection model with a selectable backbone.

        :param opt: option namespace (scale, backbone, num_classes,
            checkpoint_dir, tag)
        :param logger: stored-nowhere here; kept for interface parity
        :raises RuntimeError: for the ambiguous backbone names 'res'/'resnet'
        :raises NotImplementedError: for any other unknown backbone name
        """
        super(Model, self).__init__()
        self.opt = opt
        self.logger = logger

        # Optional resize bounds forwarded to the torchvision detector;
        # max_size keeps a 4:3 ratio relative to min_size.
        kargs = {}
        if opt.scale:
            min_size = opt.scale
            max_size = int(min_size / 3 * 4)
            kargs = {
                'min_size': min_size,
                'max_size': max_size,
            }

        # NOTE(review): `nms_thresh` is not defined in this method --
        # presumably a module-level constant; confirm.
        kargs.update({'box_nms_thresh': nms_thresh})

        # Choose the backbone and build the Faster R-CNN detector.
        if opt.backbone is None or opt.backbone.lower() in [
                'res50', 'resnet50'
        ]:
            # Default: ResNet-50 with FPN.
            self.detector = fasterrcnn_resnet50_fpn(pretrained=False, **kargs)

            in_features = self.detector.roi_heads.box_predictor.cls_score.in_features

            # Replace the stock head with one sized to our classes
            # (+1 for the background class).
            self.detector.roi_heads.box_predictor = FastRCNNPredictor(
                in_features, opt.num_classes + 1)

        elif opt.backbone.lower() in ['vgg16', 'vgg']:
            backbone = vgg16_backbone()
            self.detector = FasterRCNN(backbone,
                                       num_classes=opt.num_classes + 1,
                                       **kargs)

        elif opt.backbone.lower() in ['res101', 'resnet101']:
            # ResNet-101 without FPN.
            backbone = res101_backbone()
            self.detector = FasterRCNN(backbone,
                                       num_classes=opt.num_classes + 1,
                                       **kargs)

        elif opt.backbone.lower() in ['res', 'resnet']:
            raise RuntimeError(
                f'backbone "{opt.backbone}" is ambiguous, please specify layers.'
            )

        else:
            raise NotImplementedError(f'no such backbone: {opt.backbone}')

        print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metrics and checkpoint directory.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #6
0
    def __init__(self, opt):
        """Minimal classification model (no checkpoint bookkeeping)."""
        super(Model, self).__init__()
        self.opt = opt

        # Build and report the classifier network.
        self.classifier = Classifier(opt.model)
        print_network(self.classifier)

        # Attach optimizer and LR scheduler.
        self.optimizer = get_optimizer(opt, self.classifier)
        self.scheduler = get_scheduler(opt, self.optimizer)
Example #7
0
def start_like_scheduler(campaign, api):
    """Schedule the recurring like-update cron job for *campaign*.

    :param campaign: campaign ORM object; its ``job_id`` is updated and
        committed via the global session
    :param api: API handle forwarded to the job
    :return: the scheduler holding the new job
    """
    logging.basicConfig()
    scheduler = get_scheduler()
    # Fix: Python-2-only `print` statements -> function calls.
    print("added")
    # Run at the next minute; misfire_grace_time=None never skips a run.
    # NOTE(review): `start` is 60 when called during minute 59 -- likely an
    # invalid cron minute; confirm.
    start = datetime.datetime.today().minute + 1
    job = scheduler.add_job(update_likes_q, 'cron', minute=start,
                            misfire_grace_time=None, args=(campaign.id, api))
    campaign.job_id = job.id
    print("Job ID: {}".format(job.id))
    global_session.commit()
    return scheduler
Example #8
0
def start_like_scheduler(campaign, api):
    """Schedule the recurring like-update cron job for *campaign*.

    :param campaign: campaign ORM object; its ``job_id`` is updated and
        committed via the global session
    :param api: API handle forwarded to the job
    :return: the scheduler holding the new job
    """
    logging.basicConfig()
    scheduler = get_scheduler()
    # Fix: Python-2-only `print` statements -> function calls.
    print("added")
    # Run at the next minute; misfire_grace_time=None never skips a run.
    # NOTE(review): `start` is 60 when called during minute 59 -- likely an
    # invalid cron minute; confirm.
    start = datetime.datetime.today().minute + 1
    job = scheduler.add_job(update_likes_q, 'cron', minute=start,
                            misfire_grace_time=None, args=(campaign.id, api))
    campaign.job_id = job.id
    print("Job ID: {}".format(job.id))
    global_session.commit()
    return scheduler
Example #9
0
    def load_config(self, config, node_section):
        """Populate physical-node settings from a ConfigParser section.

        :param config: ConfigParser object
        :param node_section: section describing the node configuration
        :raises ConfigError: on an unrecognized option name
        """
        assert node_section in config.sections()
        options = config.options(node_section)
        for option in options:
            if option == "cpu_freq":
                self.cpu_freq = config.getfloat(node_section, option)
            elif option == "num_cpus":
                self.num_cpus = config.getint(node_section, option)
            elif option == "disk_bandwidth":
                self.disk_bandwidth = config.getfloat(node_section, option)
            elif option == "num_disks":
                self.num_disks = config.getint(node_section, option)
            elif option == "network_bandwidth":
                self.network_bandwidth = config.getfloat(node_section, option)
            elif option == "num_links":
                self.num_links = config.getint(node_section, option)
            elif option == "cpu_scheduler":
                # Scheduler options name another config section; a factory is
                # stored so the scheduler is built lazily per environment.
                cpu_sched_section = config.get(node_section, option)
                self.cpu_scheduler_generator = lambda env: get_scheduler(
                    env, config, cpu_sched_section)
            elif option == "io_scheduler":
                io_sched_section = config.get(node_section, option)
                self.io_scheduler_generator = lambda env: get_scheduler(
                    env, config, io_sched_section)
            elif option == "network_scheduler":
                net_sched_section = config.get(node_section, option)
                self.network_scheduler_generator = lambda env: get_scheduler(
                    env, config, net_sched_section)
            elif option == "resource_monitor_interval":
                self.resource_monitor_interval = config.getfloat(
                    node_section, option)
            else:
                raise ConfigError("Unknown phy_node option: " + option)
def train_new_model(model, train_queue, valid_queue, test_queue):
    """Train *model* for ``args.epochs`` epochs with LR warmup and layer drop.

    :param model: network (DistributedDataParallel-wrapped when distributed)
    :param train_queue: training data loader
    :param valid_queue: validation data loader
    :param test_queue: test data loader
    :return: the trained model
    """
    # Unwrap DDP to reach model-specific attributes (drop_layers, drop rates).
    ori_model = model.module if args.distributed else model
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)
    drop_layers = ori_model.drop_layers()
    criterion = get_criterion(args.classes, args.label_smoothing)

    for epoch in range(args.epochs):
        # NOTE(review): scheduler.step() runs at the start of each epoch;
        # PyTorch >= 1.1 expects it after optimizer.step() -- confirm intended.
        scheduler.step()
        if args.warmup and epoch < args.warmup_epochs:
            # Linear LR warmup overrides the scheduler's value.
            lr = args.learning_rate * epoch / args.warmup_epochs + args.warmup_lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            cond_logging('epoch %d lr %e', epoch, lr)
        else:
            lr = scheduler.get_lr()[0]
            cond_logging('epoch %d lr %e', epoch, lr)

        if args.distributed:
            train_queue.sampler.set_epoch(epoch)
        # Epoch-proportional (epd) or constant drop rate.
        if args.epd:
            drop_rate = args.drop_rate * epoch / args.epochs
        else:
            drop_rate = args.drop_rate
        drop_rates = [drop_rate] * drop_layers
        if args.layerd:
            # Scale the drop rate linearly with layer depth.
            for i in range(drop_layers):
                drop_rates[i] = drop_rates[i] * (i + 1) / drop_layers
        ori_model.set_drop_rates(drop_rates)
        cond_logging('drop rates:')
        cond_logging(ori_model.drop_rates)

        # training
        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     lr, args.report_freq, args.world_size,
                                     args.distributed, args.local_rank)

        cond_logging('train acc %f', train_acc)
        # validation/test with layer dropping disabled
        drop_rates = [0] * drop_layers
        ori_model.set_drop_rates(drop_rates)
        valid_acc, valid_obj = infer(valid_queue, model, criterion,
                                     args.report_freq, args.world_size,
                                     args.distributed, args.local_rank)
        cond_logging('valid acc %f', valid_acc)
        test_acc, test_obj = infer(test_queue, model, criterion,
                                   args.report_freq, args.world_size,
                                   args.distributed, args.local_rank)
        cond_logging('test acc %f', test_acc)
    return model
Example #11
0
    def __init__(self, opt):
        """Dehazing model wrapping an FFA network."""
        super(Model, self).__init__()
        self.opt = opt

        # Build the cleaner network on the target device and report it.
        self.cleaner = FFA().to(device=opt.device)
        print_network(self.cleaner)

        # Generator optimizer and its LR scheduler.
        self.g_optimizer = get_optimizer(opt, self.cleaner)
        self.scheduler = get_scheduler(opt, self.g_optimizer)

        # EMA-smoothed metrics and checkpoint location.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #12
0
    def __init__(self, opt, logger=None):
        """Detection model built from ``get_net()``.

        :param opt: option namespace (device, debug, tag)
        :param logger: unused, kept for interface parity
        """
        # Fix: the original passed undefined names `config` and `kwargs` to
        # super(); use the no-arg base init as sibling models do.
        super(Model, self).__init__()
        self.opt = opt
        self.detector = get_net().to(device=opt.device)

        if opt.debug:
            print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metrics and checkpoint directory.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join('checkpoints', opt.tag)
Example #13
0
    def __init__(self, opt):
        """Faster R-CNN (ResNet-50 FPN) with a custom-sized box head."""
        super(Model, self).__init__()
        self.opt = opt

        # Torchvision detector without pretrained weights.
        detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(
            pretrained=False)

        # Swap the classification head for one sized to our classes
        # (+1 for the background class).
        head_in = detector.roi_heads.box_predictor.cls_score.in_features
        detector.roi_heads.box_predictor = FastRCNNPredictor(
            head_in, opt.num_classes + 1)

        self.detector = detector
        print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #14
0
    def __init__(self, config, **kwargs):
        """Yolo v2/v3 detection model driven by a config object.

        :param config: configuration with MODEL.NAME / MODEL.LOAD fields
        :param kwargs: extra arguments forwarded to the base Module

        NOTE(review): the body also reads `opt` (device, debug, load,
        local_rank, tag), which is not a parameter -- presumably a
        module-level options object; confirm.
        """
        super(Model, self).__init__(config, kwargs)
        self.config = config

        # Pick the Darknet cfg file for the configured Yolo version.
        # NOTE(review): `cfgfile` stays unbound for any other MODEL.NAME.
        if config.MODEL.NAME == 'Yolo2':
            cfgfile = 'configs/networks/yolo2-voc.cfg'
        elif config.MODEL.NAME == 'Yolo3':
            cfgfile = 'configs/networks/yolo3-coco.cfg'

        # Build the detector on the target device.
        self.detector = Darknet(cfgfile, device=opt.device).to(opt.device)
        if opt.debug:
            print_network(self.detector)

        # Load Darknet-format weights (anything not ending in 'pt') before a
        # --load checkpoint; command-line --load wins over config.MODEL.LOAD.
        if opt.load and opt.load[-2:] != 'pt':
            if is_first_gpu():
                utils.color_print('Load Yolo weights from %s.' % opt.load, 3)
            self.detector.load_weights(opt.load)
        elif 'LOAD' in config.MODEL and config.MODEL.LOAD[-2:] != 'pt':
            if is_first_gpu():
                utils.color_print(
                    'Load Yolo weights from %s.' % config.MODEL.LOAD, 3)
            self.detector.load_weights(config.MODEL.LOAD)

        self.to(opt.device)
        # Multi-GPU: convert BatchNorm to SyncBatchNorm, then wrap with DDP.
        if is_distributed():
            self.detector = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                self.detector)
            self.detector = torch.nn.parallel.DistributedDataParallel(
                self.detector,
                find_unused_parameters=False,
                device_ids=[opt.local_rank],
                output_device=opt.local_rank)

        self.optimizer = get_optimizer(config, self.detector)
        self.scheduler = get_scheduler(config, self.optimizer)

        # EMA-smoothed metrics and checkpoint directory.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join('checkpoints', opt.tag)
Example #15
0
    def __init__(self, opt):
        """Yolo v5x detection model."""
        super(Model, self).__init__()
        self.opt = opt

        # Build the detector from its yaml config.
        cfg_path = 'configs/yolov5x.yaml'
        self.detector = Yolo5(cfg_path)
        # Attach training attributes expected by the Yolo5 implementation.
        # NOTE(review): `hyp` comes from module scope -- confirm.
        self.detector.hyp = hyp
        self.detector.gr = 1.0
        self.detector.nc = opt.num_classes

        print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
        # Global iteration counter.
        self.it = 0
Example #16
0
def exec_train(config,
               train_data_loader,
               valid_data_loader,
               OUTPUT_DIR,
               fold,
               trained_epoch=0):
    """Run the training loop for one fold.

    :param config: dict-like config with 'model', 'train', 'general' keys
    :param train_data_loader: training DataLoader
    :param valid_data_loader: validation DataLoader
    :param OUTPUT_DIR: directory for logs/checkpoints
    :param fold: zero-based fold index (logged as fold + 1)
    :param trained_epoch: epoch to resume counting from (0 = fresh run)
    """
    # load model and make parallel
    device = torch.device('cuda:0')
    model = Model(config['model']).to(device)

    # train setting: optimize only parameters that require gradients
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(config['train']['optimizer'], trainable_params)
    scheduler = get_scheduler(config['train']['scheduler'], optimizer)

    # log setting
    # NOTE(review): RUN_NAME is read from module scope -- confirm it is set.
    logger = Logger(model,
                    optimizer,
                    output_dir=OUTPUT_DIR,
                    run_name=RUN_NAME,
                    trained_epoch=trained_epoch,
                    config=config,
                    fold=fold + 1)

    # training
    for epoch in range(trained_epoch + 1, config['train']['epochs'] + 1):
        if config['general']['kfold'] < 0:
            print("\r [Epoch %d]" % epoch)
        else:
            print("\r [Fold %d : Epoch %d]" % (fold + 1, epoch))

        train_epoch(model, train_data_loader, logger, optimizer)
        evaluate_epoch(model, valid_data_loader, logger, optimizer)
        # Scheduler steps on the validation loss -- presumably a
        # ReduceLROnPlateau-style scheduler; confirm.
        if scheduler is not None:
            scheduler.step(logger.last_valid_loss)

    logger.finish_training()
Example #17
0
    def __init__(self, opt, logger=None):
        """Yolo v5x detection model.

        :param opt: option namespace (num_classes, debug, tag)
        :param logger: unused, kept for interface parity
        """
        # Fix: the original passed undefined `config`/`kwargs` to super() and
        # read `config.DATA.NUM_CLASSESS`; use the no-arg base init and `opt`,
        # matching the sibling Yolo5 model in this file.
        super(Model, self).__init__()
        self.opt = opt
        cfgfile = 'configs/yolov5x.yaml'
        self.detector = Yolo5(cfgfile)
        # Training attributes expected by the Yolo5 implementation.
        # NOTE(review): `hyp` comes from module scope -- confirm.
        self.detector.hyp = hyp
        self.detector.gr = 1.0
        self.detector.nc = opt.num_classes

        if opt.debug:
            print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join('checkpoints', opt.tag)
        # Global iteration counter.
        self.it = 0
Example #18
0
    def __init__(self, opt):
        """Classification model with a hand-built Adam optimizer."""
        super(Model, self).__init__()
        self.opt = opt

        self.classifier = Classifier()
        print_network(self.classifier)

        # Adam with a custom beta1 (0.95) instead of the shared optimizer
        # factory used elsewhere in this project.
        params = self.classifier.parameters()
        self.optimizer = optim.Adam(params, lr=opt.lr, betas=(0.95, 0.999))
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metrics and checkpoint location.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #19
0
    def __init__(self, opt, logger=None):
        """RetinaNet (ResNet-50) detection model."""
        super(Model, self).__init__()
        self.opt = opt

        # Pretrained RetinaNet sized to opt.num_classes.
        self.detector = Retina_50(opt.num_classes, pretrained=True)
        print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metrics and checkpoint location.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
    def __init__(self, opt, logger=None):
        """SSD detection model plus its ground-truth target transform."""
        super(Model, self).__init__()
        self.opt = opt

        self.detector = SSDDetector(opt).to(device=opt.device)
        print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)

        # SSD target-encoding constants.
        center_variance = 0.1
        size_variance = 0.2
        threshold = 0.5
        self.target_transform = SSDTargetTransform(
            PriorBox(opt)(), center_variance, size_variance, threshold)
Example #21
0
    def __init__(self, opt):
        """Classification model: network, optimizer and scheduler."""
        super(Model, self).__init__()
        self.opt = opt

        # Build and report the classifier network.
        self.classifier = Classifier()
        print_network(self.classifier)

        # Optimization setup.
        self.optimizer = get_optimizer(opt, self.classifier)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metrics and checkpoint location.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #22
0
    def __init__(self, config, **kwargs):
        """SSD detection model plus its ground-truth target transform.

        :param config: configuration object passed to SSDDetector/PriorBox
        :param kwargs: extra arguments forwarded to the base Module

        NOTE(review): the body reads `opt` (device, debug, tag), which is not
        a parameter -- presumably a module-level options object; confirm.
        """
        super(Model, self).__init__(config, kwargs)
        self.config = config
        self.detector = SSDDetector(config).to(device=opt.device)

        if opt.debug:
            print_network(self.detector)

        self.optimizer = get_optimizer(config, self.detector)
        self.scheduler = get_scheduler(config, self.optimizer)

        # EMA-smoothed metrics and checkpoint directory.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join('checkpoints', opt.tag)

        # SSD target-encoding constants.
        CENTER_VARIANCE = 0.1
        SIZE_VARIANCE = 0.2
        THRESHOLD = 0.5

        self.target_transform = SSDTargetTransform(
            PriorBox(config)(), CENTER_VARIANCE, SIZE_VARIANCE, THRESHOLD)
Example #23
0
    def __init__(self, opt):
        """Feature extractor + meta-embedding classification model."""
        super(Model, self).__init__()
        self.opt = opt

        # Direct feature extractor and the meta embedding built on top of it.
        self.direct_feature = DirectFeature(opt.model)
        self.feature_nums = self.direct_feature.get_feature_num()
        self.meta_embedding = MetaEmbedding(self.feature_nums, 50030)
        print_network(self.direct_feature)
        print_network(self.meta_embedding)

        # One Adam optimizer over both sub-networks.
        # TODO: consider separate learning rates for the two sub-networks.
        joint_params = chain(self.direct_feature.parameters(),
                             self.meta_embedding.parameters())
        self.optimizer = optim.Adam(joint_params, lr=0.01)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # EMA-smoothed metrics and checkpoint location.
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)

        # Plain (unweighted) cross-entropy criterion.
        self.criterionCE = nn.CrossEntropyLoss()
Example #24
0
 def initialize(self):
     """Install the global scheduler on this object's config."""
     self.config.sched = scheduler.get_scheduler()
Example #25
0
	def timedio(self, function, args=(), kwargs={}, timeout=30):
		"""Call a method with a failsafe timeout value.

		:param function: callable to invoke under the timeout
		:param args: positional arguments for *function*
		:param kwargs: keyword arguments for *function*
		:param timeout: seconds before the scheduler aborts the call
		:return: whatever ``sched.iotimeout`` returns for the call
		"""
		# NOTE(review): the mutable default `kwargs={}` is shared across
		# calls; safe only if iotimeout never mutates it -- confirm.
		sched = scheduler.get_scheduler()
		return sched.iotimeout(function, args, kwargs, timeout)
Example #26
0
def main():
    """Entry point: parse args, build the distributed model/data/optimizer
    stack, then train or (with --evaluate) validate only."""
    global args, config, best_prec1
    args = parser.parse_args()

    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # PyYAML >= 5.1 and unsafe on untrusted input -- confirm config source.
    with open(args.config) as f:
        config = yaml.load(f)

    config = EasyDict(config['common'])
    config.save_path = os.path.dirname(args.config)

    rank, world_size = dist_init()

    # create model: group batch-norm statistics across bn_group_size ranks
    bn_group_size = config.model.kwargs.bn_group_size
    bn_var_mode = config.model.kwargs.get('bn_var_mode', 'L2')
    if bn_group_size == 1:
        bn_group = None
    else:
        assert world_size % bn_group_size == 0
        bn_group = simple_group_split(world_size, rank,
                                      world_size // bn_group_size)

    config.model.kwargs.bn_group = bn_group
    config.model.kwargs.bn_var_mode = (link.syncbnVarMode_t.L1 if bn_var_mode
                                       == 'L1' else link.syncbnVarMode_t.L2)
    model = model_entry(config.model)
    if rank == 0:
        print(model)

    model.cuda()

    # fp16 mode is implied by the optimizer type
    if config.optimizer.type == 'FP16SGD' or config.optimizer.type == 'FusedFP16SGD':
        args.fp16 = True
    else:
        args.fp16 = False

    if args.fp16:
        # if you have modules that must use fp32 parameters, and need fp32 input
        # try use link.fp16.register_float_module(your_module)
        # if you only need fp32 parameters set cast_args=False when call this
        # function, then call link.fp16.init() before call model.half()
        if config.optimizer.get('fp16_normal_bn', False):
            print('using normal bn for fp16')
            link.fp16.register_float_module(link.nn.SyncBatchNorm2d,
                                            cast_args=False)
            link.fp16.register_float_module(torch.nn.BatchNorm2d,
                                            cast_args=False)
            link.fp16.init()
        model.half()

    model = DistModule(model, args.sync)

    # create optimizer (optionally excluding weight decay on selected params)
    opt_config = config.optimizer
    opt_config.kwargs.lr = config.lr_scheduler.base_lr
    if config.get('no_wd', False):
        param_group, type2num = param_group_no_wd(model)
        opt_config.kwargs.params = param_group
    else:
        opt_config.kwargs.params = model.parameters()

    optimizer = optim_entry(opt_config)

    # optionally resume from a checkpoint
    last_iter = -1
    best_prec1 = 0
    if args.load_path:
        if args.recover:
            best_prec1, last_iter = load_state(args.load_path,
                                               model,
                                               optimizer=optimizer)
        else:
            load_state(args.load_path, model)

    cudnn.benchmark = True

    # Data loading code (ImageNet-style normalization)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # augmentation
    aug = [
        transforms.RandomResizedCrop(config.augmentation.input_size),
        transforms.RandomHorizontalFlip()
    ]

    # reject unknown augmentation keys early
    for k in config.augmentation.keys():
        assert k in [
            'input_size', 'test_resize', 'rotation', 'colorjitter', 'colorold'
        ]
    rotation = config.augmentation.get('rotation', 0)
    colorjitter = config.augmentation.get('colorjitter', None)
    colorold = config.augmentation.get('colorold', False)

    if rotation > 0:
        aug.append(transforms.RandomRotation(rotation))

    if colorjitter is not None:
        aug.append(transforms.ColorJitter(*colorjitter))

    aug.append(transforms.ToTensor())

    if colorold:
        aug.append(ColorAugmentation())

    aug.append(normalize)

    # train dataset
    train_dataset = McDataset(config.train_root,
                              config.train_source,
                              transforms.Compose(aug),
                              fake=args.fake)

    # val dataset (deterministic resize + center crop)
    val_dataset = McDataset(
        config.val_root, config.val_source,
        transforms.Compose([
            transforms.Resize(config.augmentation.test_resize),
            transforms.CenterCrop(config.augmentation.input_size),
            transforms.ToTensor(),
            normalize,
        ]), args.fake)

    # samplers: fixed-iteration training sampler, plain distributed val sampler
    train_sampler = DistributedGivenIterationSampler(
        train_dataset,
        config.lr_scheduler.max_iter,
        config.batch_size,
        last_iter=last_iter)
    val_sampler = DistributedSampler(val_dataset, round_up=False)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=False,
                              num_workers=config.workers,
                              pin_memory=True,
                              sampler=train_sampler)

    val_loader = DataLoader(val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=config.workers,
                            pin_memory=True,
                            sampler=val_sampler)

    # the LR scheduler wraps the inner optimizer when using FP16SGD
    config.lr_scheduler['optimizer'] = optimizer.optimizer if isinstance(
        optimizer, FP16SGD) else optimizer
    config.lr_scheduler['last_iter'] = last_iter
    lr_scheduler = get_scheduler(config.lr_scheduler)

    # only rank 0 writes tensorboard events and the text log
    if rank == 0:
        tb_logger = SummaryWriter(config.save_path + '/events')
        logger = create_logger('global_logger', config.save_path + '/log.txt')
        logger.info('args: {}'.format(pprint.pformat(args)))
        logger.info('config: {}'.format(pprint.pformat(config)))
    else:
        tb_logger = None

    # evaluation-only mode: validate (optionally with fusion) and exit
    if args.evaluate:
        if args.fusion_list is not None:
            validate(val_loader,
                     model,
                     fusion_list=args.fusion_list,
                     fuse_prob=args.fuse_prob)
        else:
            validate(val_loader, model)
        link.finalize()
        return

    train(train_loader, val_loader, model, optimizer, lr_scheduler,
          last_iter + 1, tb_logger)

    link.finalize()
    def load_config(self, config, cassandra_section):
        """Load the configuration of the section describing the cassandra system.

        Typically named after [cassandra].

        Per-stage options (detected via ``_if_has_stg_name``) populate the
        ``stg_*`` dicts; ``common_*`` options configure the shared-stage
        defaults; the remaining options are plain cassandra-wide settings.

        :param config: ConfigParser object holding the parsed ini file
        :param cassandra_section: name of the cassandra section to read
        :raises ConfigError: on an unrecognised option name
        """
        assert isinstance(config, ConfigParser.ConfigParser)
        assert cassandra_section in config.sections()

        options = config.options(cassandra_section)
        for option in options:
            [has_stg_name, stg_name] = self._if_has_stg_name(option)
            if has_stg_name:
                if 'num_workers' in option:
                    self.stg_num_workers_dict[stg_name] = config.getint(
                        cassandra_section, option)
                elif 'scheduler' in option:
                    print('customerized scheduler:' + option)
                    stg_scheduler_section = config.get(cassandra_section,
                                                       option)
                    # Bind the section name as a default argument: a plain
                    # closure would late-bind `stg_scheduler_section`, making
                    # every stage's generator use the LAST section read in
                    # this loop instead of its own.
                    self.stg_scheduler_generator_dict[stg_name] = (
                        lambda env, _sec=stg_scheduler_section:
                            get_scheduler(env, config, _sec))
                elif 'schedule_resource' in option:
                    self.stg_schedule_resource_dict[stg_name] = config.get(
                        cassandra_section, option)
                elif 'type_name' in option:
                    self.stg_type_data_dict[stg_name] = {
                        self.type_name_kw: config.get(cassandra_section,
                                                      option)
                    }
                else:
                    print('Warning: not handle stage config:' + option)
            elif option == 'common_stg_scheduler':
                common_stg_scheduler_section = config.get(
                    cassandra_section, option)
                # Same default-argument binding to avoid the late-binding trap.
                self.stg_scheduler_generator_dict[self.common_stg_name] = (
                    lambda env, _sec=common_stg_scheduler_section:
                        get_scheduler(env, config, _sec))
                self._init_stg_scheduler_dict()
            elif option == 'common_stg_schedule_resource':
                self.stg_schedule_resource_dict[
                    self.common_stg_name] = config.get(cassandra_section,
                                                       option)
                self._init_stg_scheduler_dict()
            elif option == 'common_stg_num_workers':
                self.stg_num_workers_dict[
                    self.common_stg_name] = config.getint(
                        cassandra_section, option)
            elif option == 'node_token_sum':
                self.node_token_sum = config.getint(cassandra_section, option)
            elif option == 'pnode_type':
                self.pnode_type = config.get(cassandra_section, option)
            elif option == 'replication_strategy':
                self.replication_strategy = config.get(cassandra_section,
                                                       option)
            elif option == 'token_allocation_strategy':
                self.token_allocation_strategy = config.get(
                    cassandra_section, option)
            elif option == 'stage_monitor_interval':
                self.stage_monitor_interval = config.getfloat(
                    cassandra_section, option)
            elif option == 'unified_scheduler':
                unified_scheduler_section = config.get(cassandra_section,
                                                       option)
                self.unified_scheduler_generator = (
                    lambda env, _sec=unified_scheduler_section:
                        get_scheduler(env, config, _sec))
            else:
                raise ConfigError('Unknown option for cassandra:' + option)

        # Build config for special stages only after the whole loop has run:
        # only then have all the stage names been read.  (The original called
        # this inside the loop, once per option, before later options were
        # parsed — contradicting its own comment.)
        self._build_special_stage_param(config, cassandra_section)
Example #28
0
    def __init__(self, config, **kwargs):
        """Build a Faster R-CNN detector from *config*.

        The backbone is selected by ``config.MODEL.BACKBONE`` (ResNet-50 with
        FPN by default), input resizing comes from ``config.DATA.SCALE``, and
        the optimizer/scheduler are created from the same config.

        NOTE(review): the body references ``opt`` (debug flag, device,
        local_rank, tag) which is not a parameter of this method — presumably
        a module-level options object; confirm it is in scope in this file.
        """
        # NOTE(review): kwargs is passed positionally as a dict (not expanded
        # with **) — confirm the base class expects a single dict argument.
        super(Model, self).__init__(config, kwargs)
        self.config = config

        kargs = {}
        if 'SCALE' in config.DATA:
            scale = config.DATA.SCALE
            if isinstance(scale, int):
                # a single int fixes the short side; long side = short * 5/3
                min_size = scale
                max_size = int(min_size / 3 * 5)
            else:
                min_size, max_size = config.DATA.SCALE

            kargs = {
                'min_size': min_size,
                'max_size': max_size,
            }

        kargs.update({'box_nms_thresh': config.TEST.NMS_THRESH})

        # use SyncBatchNorm when running distributed multi-GPU training
        if is_distributed():
            kargs.update({'norm_layer': torch.nn.SyncBatchNorm})

        # build the backbone and the Faster R-CNN detector
        if config.MODEL.BACKBONE is None or config.MODEL.BACKBONE.lower() in [
                'res50', 'resnet50'
        ]:
            # default: ResNet-50 with FPN
            self.detector = fasterrcnn_resnet50_fpn(pretrained=False, **kargs)

            in_features = self.detector.roi_heads.box_predictor.cls_score.in_features

            # replace the pre-trained head with a new one
            # (+1 class for the background)
            self.detector.roi_heads.box_predictor = FastRCNNPredictor(
                in_features, config.DATA.NUM_CLASSESS + 1)

        elif config.MODEL.BACKBONE.lower() in ['vgg16', 'vgg']:
            backbone = vgg16_backbone()
            self.detector = FasterRCNN(backbone,
                                       num_classes=config.DATA.NUM_CLASSESS +
                                       1,
                                       **kargs)

        elif config.MODEL.BACKBONE.lower() in ['res101', 'resnet101']:
            # ResNet-101 without FPN
            backbone = res101_backbone()
            self.detector = FasterRCNN(backbone,
                                       num_classes=config.DATA.NUM_CLASSESS +
                                       1,
                                       **kargs)

        elif config.MODEL.BACKBONE.lower() in ['res', 'resnet']:
            raise RuntimeError(
                f'backbone "{config.MODEL.BACKBONE}" is ambiguous, please specify layers.'
            )

        else:
            raise NotImplementedError(
                f'no such backbone: {config.MODEL.BACKBONE}')

        if opt.debug and is_first_gpu():
            print_network(self.detector)

        self.to(opt.device)
        # multi-GPU support: wrap the detector in DistributedDataParallel
        if is_distributed():
            self.detector = torch.nn.parallel.DistributedDataParallel(
                self.detector,
                find_unused_parameters=False,
                device_ids=[opt.local_rank],
                output_device=opt.local_rank)
            # self.detector = torch.nn.parallel.DistributedDataParallel(self.detector, device_ids=[opt.local_rank], output_device=opt.local_rank)

        self.optimizer = get_optimizer(config, self.detector)
        self.scheduler = get_scheduler(config, self.optimizer)

        # exponential moving average of training metrics for smoother logging
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join('checkpoints', opt.tag)
Example #29
0
#!/usr/bin/python
import sys
import time
import datetime

import utils
import logger
import config
log = logger.get_logger(__name__)
conf = config.get_config()
import scheduler
schedule = scheduler.get_scheduler()
import slack
import smtp
import audio

# run all the input services
def run():
	"""Schedule the digest reports and start the configured input services."""
	# daily per-module digest e-mail, one job per enabled module
	for module in conf["modules"]:
		if not module["enabled"] or "daily_digest" not in module:
			continue
		if not module["daily_digest"]:
			continue
		module_id = module["module_id"]
		schedule.add_job(smtp.module_digest,'cron',hour="23",minute="55",second=utils.randint(1,59),args=[module_id])
		log.info("["+module_id+"] scheduling daily module digest")
	# daily digest of all the alerts
	if conf["output"]["email"]["alerts_digest"]: 
		log.info("scheduling daily alert digest")
		schedule.add_job(smtp.alerts_digest,'cron',hour="0",minute="55",args=[])
	# start the slack bot immediately
	if conf["input"]["slack"]["enabled"]: schedule.add_job(slack.run,'date',run_date=datetime.datetime.now())
Example #30
0
def main(local_rank, args):
    """Distributed train/eval entry point for the AVA action-detection model.

    Initialises the process group, loads the YAML config from ``args.config``,
    builds the data loaders, model, optimizer and scheduler, then either
    evaluates a checkpoint or runs the train/val loop.

    :param local_rank: GPU index of this process on the local node
    :param args: parsed CLI arguments (config path, distributed settings, ...)
    """
    rank, world_size = init_distributed(local_rank, args)

    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    opt = EasyDict(config)
    opt.world_size = world_size

    # only rank 0 creates the result directory, log file and tensorboard writer
    if rank == 0:
        mkdir(opt.result_path)
        mkdir(os.path.join(opt.result_path, 'tmp'))
        with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
            json.dump(vars(opt), opt_file, indent=2)
        logger = create_logger(os.path.join(opt.result_path, 'log.txt'))
        logger.info('opt: {}'.format(pprint.pformat(opt, indent=2)))

        writer = SummaryWriter(os.path.join(opt.result_path, 'tb'))
    else:
        logger = writer = None
    # make sure rank 0 has created result_path before other ranks continue
    dist.barrier()

    random_seed(opt.manual_seed)
    # setting benchmark to True causes OOM in some cases
    if opt.get('cudnn', None) is not None:
        torch.backends.cudnn.deterministic = opt.cudnn.get(
            'deterministic', False)
        torch.backends.cudnn.benchmark = opt.cudnn.get('benchmark', False)

    # create model
    net = AVA_model(opt.model)
    net.cuda()
    net = DistributedDataParallel(net,
                                  device_ids=[local_rank],
                                  broadcast_buffers=False)

    if rank == 0:
        logger.info(net)
        logger.info(parameters_string(net))

    # training-only setup: augmentations, dataset, optimizer and scheduler.
    # Note: `optimizer`, `scheduler`, `train_loader`, `train_sampler` and the
    # train loggers only exist when not in evaluate mode; later uses are
    # guarded by the same `opt.get('evaluate', False)` condition.
    if not opt.get('evaluate', False):
        train_aug = opt.train.augmentation

        spatial_transform = [
            getattr(spatial_transforms, aug.type)(**aug.get('kwargs', {}))
            for aug in train_aug.spatial
        ]
        spatial_transform = spatial_transforms.Compose(spatial_transform)

        temporal_transform = getattr(
            temporal_transforms,
            train_aug.temporal.type)(**train_aug.temporal.get('kwargs', {}))

        train_data = ava.AVA(opt.train.root_path, opt.train.annotation_path,
                             spatial_transform, temporal_transform)

        train_sampler = DistributedSampler(train_data, round_down=True)

        train_loader = ava.AVADataLoader(train_data,
                                         batch_size=opt.train.batch_size,
                                         shuffle=False,
                                         num_workers=opt.train.get(
                                             'workers', 1),
                                         pin_memory=True,
                                         sampler=train_sampler,
                                         drop_last=True)

        if rank == 0:
            logger.info('# train data: {}'.format(len(train_data)))
            logger.info('train spatial aug: {}'.format(spatial_transform))
            logger.info('train temporal aug: {}'.format(temporal_transform))

            train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                                  ['epoch', 'loss', 'lr'])
            train_batch_logger = Logger(
                os.path.join(opt.result_path, 'train_batch.log'),
                ['epoch', 'batch', 'iter', 'loss', 'lr'])
        else:
            train_logger = train_batch_logger = None

        optim_opt = opt.train.optimizer
        sched_opt = opt.train.scheduler

        optimizer = getattr(optim, optim_opt.type)(net.parameters(),
                                                   lr=sched_opt.base_lr,
                                                   **optim_opt.kwargs)
        scheduler = get_scheduler(sched_opt, optimizer, opt.train.n_epochs,
                                  len(train_loader))

    # validation transforms: each spatial aug may list several kwargs options;
    # collect the per-aug choices and the number of combinations
    val_aug = opt.val.augmentation

    transform_choices, total_choices = [], 1
    for aug in val_aug.spatial:
        kwargs_list = aug.get('kwargs', {})
        if not isinstance(kwargs_list, list):
            kwargs_list = [kwargs_list]
        cur_choices = [
            getattr(spatial_transforms, aug.type)(**kwargs)
            for kwargs in kwargs_list
        ]
        transform_choices.append(cur_choices)
        total_choices *= len(cur_choices)

    # decode choice_idx as a mixed-radix number to enumerate every combination
    # of the per-aug options (one Compose per combination, for multicrop eval)
    spatial_transform = []
    for choice_idx in range(total_choices):
        idx, transform = choice_idx, []
        for cur_choices in transform_choices:
            n_choices = len(cur_choices)
            cur_idx = idx % n_choices
            transform.append(cur_choices[cur_idx])
            idx = idx // n_choices
        spatial_transform.append(spatial_transforms.Compose(transform))

    temporal_transform = getattr(
        temporal_transforms,
        val_aug.temporal.type)(**val_aug.temporal.get('kwargs', {}))

    val_data = ava.AVAmulticrop(opt.val.root_path, opt.val.annotation_path,
                                spatial_transform, temporal_transform)

    val_sampler = DistributedSampler(val_data, round_down=False)

    val_loader = ava.AVAmulticropDataLoader(val_data,
                                            batch_size=opt.val.batch_size,
                                            shuffle=False,
                                            num_workers=opt.val.get(
                                                'workers', 1),
                                            pin_memory=True,
                                            sampler=val_sampler)

    # val logging is only useful when there is something beyond 'epoch' to log
    val_logger = None
    if rank == 0:
        logger.info('# val data: {}'.format(len(val_data)))
        logger.info('val spatial aug: {}'.format(spatial_transform))
        logger.info('val temporal aug: {}'.format(temporal_transform))

        val_log_items = ['epoch']
        if opt.val.with_label:
            val_log_items.append('loss')
        if opt.val.get('eval_mAP', None) is not None:
            val_log_items.append('mAP')
        if len(val_log_items) > 1:
            val_logger = Logger(os.path.join(opt.result_path, 'val.log'),
                                val_log_items)

    if opt.get('pretrain', None) is not None:
        load_pretrain(opt.pretrain, net)

    # resume: relative paths are resolved against result_path; checkpoints are
    # mapped onto this process's GPU
    begin_epoch = 1
    if opt.get('resume_path', None) is not None:
        if not os.path.isfile(opt.resume_path):
            opt.resume_path = os.path.join(opt.result_path, opt.resume_path)
        checkpoint = torch.load(
            opt.resume_path, map_location=lambda storage, loc: storage.cuda())

        begin_epoch = checkpoint['epoch'] + 1
        net.load_state_dict(checkpoint['state_dict'])
        if rank == 0:
            logger.info('Resumed from checkpoint {}'.format(opt.resume_path))

        if not opt.get('evaluate', False):
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            if rank == 0:
                logger.info(
                    'Also loaded optimizer and scheduler from checkpoint {}'.
                    format(opt.resume_path))

    criterion, act_func = getattr(losses,
                                  opt.loss.type)(**opt.loss.get('kwargs', {}))

    if opt.get('evaluate', False):  # evaluation mode
        val_epoch(begin_epoch - 1, val_loader, net, criterion, act_func, opt,
                  logger, val_logger, rank, world_size, writer)
    else:  # training and validation mode
        for e in range(begin_epoch, opt.train.n_epochs + 1):
            train_sampler.set_epoch(e)
            train_epoch(e, train_loader, net, criterion, optimizer, scheduler,
                        opt, logger, train_logger, train_batch_logger, rank,
                        world_size, writer)

            if e % opt.train.val_freq == 0:
                val_epoch(e, val_loader, net, criterion, act_func, opt, logger,
                          val_logger, rank, world_size, writer)

    if rank == 0:
        writer.close()
Example #31
0
	def initialize(self):
		"""Attach a scheduler instance to the shared config object."""
		sched = scheduler.get_scheduler()
		self.config.sched = sched
Example #32
0
 def initialize(self):
     """Set up platform services and record the starting RTC interrupt count."""
     self.config.sched = scheduler.get_scheduler()
     self.config.rtc = Linux.rtc.RTC()
     interrupts = proc.interrupts.get_interrupts()
     self.config.interrupts = interrupts
     # remember interrupt line 8's initial count as the baseline
     self.config.irqstart = interrupts[8][0].count
Example #33
0
    def __init__(self, opt, logger=None):
        """Build a Cascade R-CNN detector according to *opt*.

        The backbone is selected by ``opt.backbone`` (ResNet-50 with FPN by
        default); input resizing comes from ``opt.scale``.

        :param opt: options object (scale, backbone, num_classes, tag, ...)
        :param logger: accepted for interface compatibility, not used here
        """
        super(Model, self).__init__()
        self.opt = opt

        # input resize bounds: an explicit scale fixes the short side
        # (long side = short * 5/3), otherwise use the 800/1333 defaults
        if opt.scale:
            min_size = opt.scale
            max_size = int(min_size / 3 * 5)
        else:
            min_size = 800
            max_size = 1333

        kargs = {
            'min_size': min_size,
            'max_size': max_size,
            'cascade_iou_thr': [0.5, 0.6, 0.7],
        }

        backbone_name = None if opt.backbone is None else opt.backbone.lower()

        # build the backbone and the Cascade R-CNN detector
        if backbone_name is None or backbone_name in ('res50', 'resnet50'):
            # default: ResNet-50 with FPN
            self.detector = cascadercnn_resnet50_fpn(pretrained=False, **kargs)

            in_features = self.detector.roi_heads[
                0].box_predictor.cls_score.in_features

            # give each of the three cascade stages a fresh predictor head
            # sized for our classes (+1 for the background class)
            for stage in range(3):
                self.detector.roi_heads[stage].box_predictor = \
                    FastRCNNPredictor(in_features, opt.num_classes + 1)

        elif backbone_name in ('vgg16', 'vgg'):
            self.detector = CascadeRCNN(vgg16_backbone(),
                                        num_classes=opt.num_classes + 1,
                                        **kargs)

        elif backbone_name in ('res101', 'resnet101'):
            # ResNet-101 without FPN
            self.detector = CascadeRCNN(res101_backbone(),
                                        num_classes=opt.num_classes + 1,
                                        **kargs)

        elif backbone_name in ('res', 'resnet'):
            raise RuntimeError(
                f'backbone "{opt.backbone}" is ambiguous, please specify layers.'
            )

        else:
            raise NotImplementedError(f'no such backbone: {opt.backbone}')

        print_network(self.detector)

        self.optimizer = get_optimizer(opt, self.detector)
        self.scheduler = get_scheduler(opt, self.optimizer)

        # exponential moving average of training metrics for smoother logging
        self.avg_meters = ExponentialMovingAverage(0.95)
        self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
Example #34
0
def pause_job(job_id):
    """Postpone the next run of *job_id* by one hour.

    Despite the name this does not suspend the job permanently: it only
    pushes the job's next scheduled run one hour into the future.

    :param job_id: identifier of the job in the scheduler
    :return: True if the job was rescheduled, False if no such job exists
    """
    scheduler = get_scheduler()
    job = scheduler.get_job(job_id)
    if job is None:
        # unknown job id: previously this crashed with AttributeError
        return False
    new_time = job.next_run_time + datetime.timedelta(hours=1)
    # NOTE(review): Job._modify is APScheduler-private and skips the scheduler
    # wakeup; job.modify(next_run_time=...) is the public equivalent — confirm
    # whether bypassing the wakeup is intentional.
    job._modify(next_run_time=new_time)
    return True
Example #35
0
    def __init__(self, config_path, run_dir):
        """Create a trainer from a YAML config and prepare the run directory.

        Copies the config into *run_dir*, then builds (in order) the device,
        datasets/dataloaders, model, optimizer, scheduler, checkpoint loading,
        metric/score files, prototype/transformation output folders and an
        optional Visdom visualizer.

        :param config_path: path to an existing YAML configuration file
        :param run_dir: directory where logs, metrics and images are written
        """
        self.config_path = coerce_to_path_and_check_exist(config_path)
        self.run_dir = coerce_to_path_and_create_dir(run_dir)
        self.logger = get_logger(self.run_dir, name="trainer")
        self.print_and_log_info(
            "Trainer initialisation: run directory is {}".format(run_dir))

        # keep a copy of the exact config used for this run, for reproducibility
        shutil.copy(self.config_path, self.run_dir)
        self.print_and_log_info("Config {} copied to run directory".format(
            self.config_path))

        with open(self.config_path) as fp:
            cfg = yaml.load(fp, Loader=yaml.FullLoader)

        if torch.cuda.is_available():
            type_device = "cuda"
            nb_device = torch.cuda.device_count()
        else:
            type_device = "cpu"
            nb_device = None
        self.device = torch.device(type_device)
        self.print_and_log_info("Using {} device, nb_device is {}".format(
            type_device, nb_device))

        # Datasets and dataloaders
        self.dataset_kwargs = cfg["dataset"]
        self.dataset_name = self.dataset_kwargs.pop("name")
        train_dataset = get_dataset(self.dataset_name)("train",
                                                       **self.dataset_kwargs)
        val_dataset = get_dataset(self.dataset_name)("val",
                                                     **self.dataset_kwargs)
        self.n_classes = train_dataset.n_classes
        self.is_val_empty = len(val_dataset) == 0
        self.print_and_log_info("Dataset {} instantiated with {}".format(
            self.dataset_name, self.dataset_kwargs))
        self.print_and_log_info(
            "Found {} classes, {} train samples, {} val samples".format(
                self.n_classes, len(train_dataset), len(val_dataset)))

        self.img_size = train_dataset.img_size
        self.batch_size = cfg["training"]["batch_size"]
        self.n_workers = cfg["training"].get("n_workers", 4)
        self.train_loader = DataLoader(train_dataset,
                                       batch_size=self.batch_size,
                                       num_workers=self.n_workers,
                                       shuffle=True)
        self.val_loader = DataLoader(val_dataset,
                                     batch_size=self.batch_size,
                                     num_workers=self.n_workers)
        self.print_and_log_info(
            "Dataloaders instantiated with batch_size={} and n_workers={}".
            format(self.batch_size, self.n_workers))

        # the run length may be given as iterations OR epochs, never both;
        # derive the missing one from the number of batches per epoch
        self.n_batches = len(self.train_loader)
        self.n_iterations, self.n_epoches = cfg["training"].get(
            "n_iterations"), cfg["training"].get("n_epoches")
        assert not (self.n_iterations is not None
                    and self.n_epoches is not None)
        if self.n_iterations is not None:
            self.n_epoches = max(self.n_iterations // self.n_batches, 1)
        else:
            self.n_iterations = self.n_epoches * len(self.train_loader)

        # Model
        self.model_kwargs = cfg["model"]
        self.model_name = self.model_kwargs.pop("name")
        self.is_gmm = 'gmm' in self.model_name
        self.model = get_model(self.model_name)(
            self.train_loader.dataset, **self.model_kwargs).to(self.device)
        self.print_and_log_info("Using model {} with kwargs {}".format(
            self.model_name, self.model_kwargs))
        self.print_and_log_info('Number of trainable parameters: {}'.format(
            f'{count_parameters(self.model):,}'))
        self.n_prototypes = self.model.n_prototypes

        # Optimizer: separate param groups (and kwargs) for cluster and
        # transformer parameters
        opt_params = cfg["training"]["optimizer"] or {}
        optimizer_name = opt_params.pop("name")
        cluster_kwargs = opt_params.pop('cluster', {})
        tsf_kwargs = opt_params.pop('transformer', {})
        self.optimizer = get_optimizer(optimizer_name)([
            dict(params=self.model.cluster_parameters(), **cluster_kwargs),
            dict(params=self.model.transformer_parameters(), **tsf_kwargs)
        ], **opt_params)
        self.model.set_optimizer(self.optimizer)
        self.print_and_log_info("Using optimizer {} with kwargs {}".format(
            optimizer_name, opt_params))
        self.print_and_log_info("cluster kwargs {}".format(cluster_kwargs))
        self.print_and_log_info("transformer kwargs {}".format(tsf_kwargs))

        # Scheduler
        scheduler_params = cfg["training"].get("scheduler", {}) or {}
        scheduler_name = scheduler_params.pop("name", None)
        self.scheduler_update_range = scheduler_params.pop(
            "update_range", "epoch")
        assert self.scheduler_update_range in ["epoch", "batch"]
        # float milestones are interpreted as fractions of the total run
        # length and converted to absolute epoch/iteration numbers
        if scheduler_name == "multi_step" and isinstance(
                scheduler_params["milestones"][0], float):
            n_tot = self.n_epoches if self.scheduler_update_range == "epoch" else self.n_iterations
            scheduler_params["milestones"] = [
                round(m * n_tot) for m in scheduler_params["milestones"]
            ]
        self.scheduler = get_scheduler(scheduler_name)(self.optimizer,
                                                       **scheduler_params)
        self.cur_lr = self.scheduler.get_last_lr()[0]
        self.print_and_log_info("Using scheduler {} with parameters {}".format(
            scheduler_name, scheduler_params))

        # Pretrained / Resume (mutually exclusive)
        checkpoint_path = cfg["training"].get("pretrained")
        checkpoint_path_resume = cfg["training"].get("resume")
        assert not (checkpoint_path is not None
                    and checkpoint_path_resume is not None)
        if checkpoint_path is not None:
            self.load_from_tag(checkpoint_path)
        elif checkpoint_path_resume is not None:
            self.load_from_tag(checkpoint_path_resume, resume=True)
        else:
            self.start_epoch, self.start_batch = 1, 1

        # Train metrics & check_cluster interval
        metric_names = ['time/img', 'loss']
        metric_names += [f'prop_clus{i}' for i in range(self.n_prototypes)]
        train_iter_interval = cfg["training"]["train_stat_interval"]
        self.train_stat_interval = train_iter_interval
        self.train_metrics = Metrics(*metric_names)
        self.train_metrics_path = self.run_dir / TRAIN_METRICS_FILE
        with open(self.train_metrics_path, mode="w") as f:
            f.write("iteration\tepoch\tbatch\t" +
                    "\t".join(self.train_metrics.names) + "\n")
        self.check_cluster_interval = cfg["training"]["check_cluster_interval"]

        # Val metrics & scores
        val_iter_interval = cfg["training"]["val_stat_interval"]
        self.val_stat_interval = val_iter_interval
        self.val_metrics = Metrics('loss_val')
        self.val_metrics_path = self.run_dir / VAL_METRICS_FILE
        with open(self.val_metrics_path, mode="w") as f:
            f.write("iteration\tepoch\tbatch\t" +
                    "\t".join(self.val_metrics.names) + "\n")

        self.val_scores = Scores(self.n_classes, self.n_prototypes)
        self.val_scores_path = self.run_dir / VAL_SCORES_FILE
        with open(self.val_scores_path, mode="w") as f:
            f.write("iteration\tepoch\tbatch\t" +
                    "\t".join(self.val_scores.names) + "\n")

        # Prototypes & Variances: one output folder per prototype
        self.prototypes_path = coerce_to_path_and_create_dir(self.run_dir /
                                                             'prototypes')
        [
            coerce_to_path_and_create_dir(self.prototypes_path / f'proto{k}')
            for k in range(self.n_prototypes)
        ]
        if self.is_gmm:
            self.variances_path = coerce_to_path_and_create_dir(self.run_dir /
                                                                'variances')
            [
                coerce_to_path_and_create_dir(self.variances_path / f'var{k}')
                for k in range(self.n_prototypes)
            ]

        # Transformation predictions: fixed sample of train images whose
        # per-prototype transformations are visualised during training
        self.transformation_path = coerce_to_path_and_create_dir(
            self.run_dir / 'transformations')
        self.images_to_tsf = next(iter(
            self.train_loader))[0][:N_TRANSFORMATION_PREDICTIONS].to(
                self.device)
        for k in range(self.images_to_tsf.size(0)):
            out = coerce_to_path_and_create_dir(self.transformation_path /
                                                f'img{k}')
            convert_to_img(self.images_to_tsf[k]).save(out / 'input.png')
            [
                coerce_to_path_and_create_dir(out / f'tsf{k}')
                for k in range(self.n_prototypes)
            ]

        # Visdom
        viz_port = cfg["training"].get("visualizer_port")
        if viz_port is not None:
            from visdom import Visdom
            os.environ["http_proxy"] = ""
            self.visualizer = Visdom(
                port=viz_port,
                env=f'{self.run_dir.parent.name}_{self.run_dir.name}')
            self.visualizer.delete_env(
                self.visualizer.env)  # Clean env before plotting
            self.print_and_log_info(f"Visualizer initialised at {viz_port}")
        else:
            self.visualizer = None
            self.print_and_log_info("No visualizer initialized")
Example #36
0
 def initialize(self):
     self.sched = scheduler.get_scheduler()
     self.STOP = 0
     self._passed_timer = 0