def __init__(self, opt, logger=None):
    """YOLOv4 detection model wrapper.

    :param opt: parsed options (device, batch_size, debug, tag, ...)
    :param logger: optional logger (currently unused)
    """
    # BUG FIX: the original called super(Model, self).__init__(config, kwargs)
    # with `kwargs` (and likely `config`) undefined in this scope; the other
    # opt-based models in this file call the base constructor with no args.
    super(Model, self).__init__()
    self.opt = opt
    # NOTE(review): `config` here looks like a module-level global — confirm.
    self.detector = yolov4(inference=True, n_classes=config.DATA.NUM_CLASSESS)
    # YOLOv4 loss paired with the detector
    self.yolov4loss = Yolo_loss(device=opt.device, batch=opt.batch_size)
    if opt.debug:
        print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join('checkpoints', opt.tag)
def load_config(self, config, hdfs_section): """ :param config: ConfigParser object :param hdfs_section: section describing the hdfs configuration """ assert hdfs_section in config.sections() options = config.options(hdfs_section) for option in options: if option == "namenode_handlers": self.namenode_handlers = config.getint(hdfs_section, option) elif option == "replica": self.replica = config.getint(hdfs_section, option) elif option == "namenode_scheduler": namenode_sched_section = config.get(hdfs_section, option) if namenode_sched_section == "None": self.namenode_scheduler_generator = None else: self.namenode_scheduler_generator = lambda env: get_scheduler( env, config, namenode_sched_section) elif option == "datanode": datanode_config_section = config.get(hdfs_section, option) self.datanode_conf.load_config(config, datanode_config_section) else: print "Warning: unknown option for hdfs:", option
def __init__(self, opt, logger=None):
    """Darknet (YoloV2/YoloV3) detection model wrapper.

    :param opt: options (.model, .device, .weights, .checkpoint_dir, .tag)
    :param logger: optional logger passed through to print_network
    """
    super(Model, self).__init__()
    self.opt = opt
    self.logger = logger
    # Pick the Darknet config file for YoloV2 vs YoloV3
    if opt.model == 'Yolo2':
        cfgfile = 'configs/yolo2-voc.cfg'
    elif opt.model == 'Yolo3':
        cfgfile = 'configs/yolo3-coco.cfg'
    else:
        # BUG FIX: previously any other model name fell through and the next
        # line raised a confusing NameError on `cfgfile`.
        raise NotImplementedError(f'no such model: {opt.model}')
    # Build the detector
    self.detector = Darknet(cfgfile, device=opt.device).to(opt.device)
    print_network(self.detector, logger=logger)
    # Optionally load Darknet-format weights (happens before any --load logic)
    if opt.weights:
        utils.color_print('Load Yolo weights from %s.' % opt.weights, 3)
        self.detector.load_weights(opt.weights)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, opt):
    """Classification model: backbone, optimizer, scheduler and CE loss."""
    super(Model, self).__init__()
    self.opt = opt
    # backbone selected by name
    self.classifier = Classifier(opt.model)
    print_network(self.classifier)
    self.optimizer = get_optimizer(opt, self.classifier)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
    # plain cross-entropy (no per-class weighting)
    self.criterionCE = nn.CrossEntropyLoss()
def __init__(self, opt, logger=None):
    """Faster R-CNN detector with a configurable backbone.

    :param opt: options (.scale, .backbone, .num_classes, .checkpoint_dir, .tag)
    :param logger: optional logger (stored but not otherwise used here)
    """
    super(Model, self).__init__()
    self.opt = opt
    self.logger = logger
    kargs = {}
    if opt.scale:
        min_size = opt.scale
        # derive max resize bound from min with a fixed 4:3 ratio
        max_size = int(min_size / 3 * 4)
        kargs = {
            'min_size': min_size,
            'max_size': max_size,
        }
    # NOTE(review): `nms_thresh` is not defined in this method — presumably a
    # module-level constant; confirm.
    kargs.update({'box_nms_thresh': nms_thresh})
    # Choose the backbone and build the Faster R-CNN model
    if opt.backbone is None or opt.backbone.lower() in [
            'res50', 'resnet50'
    ]:
        # default: ResNet-50 with FPN
        self.detector = fasterrcnn_resnet50_fpn(pretrained=False, **kargs)
        in_features = self.detector.roi_heads.box_predictor.cls_score.in_features
        # replace the pre-trained head with a new one (+1 for background)
        self.detector.roi_heads.box_predictor = FastRCNNPredictor(
            in_features, opt.num_classes + 1)
    elif opt.backbone.lower() in ['vgg16', 'vgg']:
        backbone = vgg16_backbone()
        self.detector = FasterRCNN(backbone,
                                   num_classes=opt.num_classes + 1,
                                   **kargs)
    elif opt.backbone.lower() in ['res101', 'resnet101']:
        # ResNet-101 without FPN
        backbone = res101_backbone()
        self.detector = FasterRCNN(backbone,
                                   num_classes=opt.num_classes + 1,
                                   **kargs)
    elif opt.backbone.lower() in ['res', 'resnet']:
        raise RuntimeError(
            f'backbone "{opt.backbone}" is ambiguous, please specify layers.'
        )
    else:
        raise NotImplementedError(f'no such backbone: {opt.backbone}')
    print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, opt):
    """Set up the classifier network with its optimizer and lr scheduler."""
    super(Model, self).__init__()
    self.opt = opt
    self.classifier = Classifier(opt.model)
    # log the architecture
    print_network(self.classifier)
    self.optimizer = get_optimizer(opt, self.classifier)
    self.scheduler = get_scheduler(opt, self.optimizer)
def start_like_scheduler(campaign, api):
    """Schedule the periodic update_likes_q job for *campaign*.

    Stores the created job id on the campaign (committed via global_session)
    and returns the scheduler so the caller can start/stop it.
    """
    logging.basicConfig()
    scheduler = get_scheduler()
    print "added"
    # fire at the next minute boundary after "now"
    start = datetime.datetime.today().minute + 1
    job = scheduler.add_job(update_likes_q, 'cron', minute=start, \
        misfire_grace_time=None, args=(campaign.id,api,))
    #scheduler.add_job(pause_job, 'cron', minute=4, hour="4,8,12,16", args=(job.id,))
    # persist the job id so the campaign's job can be paused/resumed later
    campaign.job_id = job.id
    print "Job ID: {}".format(job.id)
    global_session.commit()
    return scheduler
def start_like_scheduler(campaign, api):
    """Schedule the periodic update_likes_q job for *campaign*.

    Near-duplicate of the sibling start_like_scheduler; stores the job id on
    the campaign (committed via global_session) and returns the scheduler.
    """
    logging.basicConfig()
    scheduler = get_scheduler()
    print "added"
    # fire at the next minute boundary after "now"
    start = datetime.datetime.today().minute + 1
    job = scheduler.add_job(update_likes_q, 'cron', minute=start, \
        misfire_grace_time=None, args=(campaign.id,api,))
    #scheduler.add_job(pause_job, 'cron', minute=4, hour="4,8,12,16", args=(job.id,))
    # persist the job id so the campaign's job can be paused/resumed later
    campaign.job_id=job.id
    print "Job ID: {}".format(job.id)
    global_session.commit()
    return scheduler
def load_config(self, config, node_section):
    """Read the physical-node options out of *node_section*.

    :param config: ConfigParser object
    :param node_section: section describing the node configuration
    :raises ConfigError: on an unrecognised option
    """
    assert node_section in config.sections()
    options = config.options(node_section)
    for option in options:
        if option == "cpu_freq":
            self.cpu_freq = config.getfloat(node_section, option)
        elif option == "num_cpus":
            self.num_cpus = config.getint(node_section, option)
        elif option == "disk_bandwidth":
            self.disk_bandwidth = config.getfloat(node_section, option)
        elif option == "num_disks":
            self.num_disks = config.getint(node_section, option)
        elif option == "network_bandwidth":
            self.network_bandwidth = config.getfloat(node_section, option)
        elif option == "num_links":
            self.num_links = config.getint(node_section, option)
        elif option == "cpu_scheduler":
            cpu_sched_section = config.get(node_section, option)
            # BUG FIX (here and below): bind the section name as a lambda
            # default so each generator keeps the value read for its own
            # option instead of whatever the variable holds when the lambda
            # finally runs (late-binding closure pitfall).
            self.cpu_scheduler_generator = \
                lambda env, section=cpu_sched_section: get_scheduler(
                    env, config, section)
        elif option == "io_scheduler":
            io_sched_section = config.get(node_section, option)
            self.io_scheduler_generator = \
                lambda env, section=io_sched_section: get_scheduler(
                    env, config, section)
        elif option == "network_scheduler":
            net_sched_section = config.get(node_section, option)
            self.network_scheduler_generator = \
                lambda env, section=net_sched_section: get_scheduler(
                    env, config, section)
        elif option == "resource_monitor_interval":
            self.resource_monitor_interval = config.getfloat(
                node_section, option)
        else:
            raise ConfigError("Unknown phy_node option: " + option)
def train_new_model(model, train_queue, valid_queue, test_queue):
    """Train *model* for args.epochs with warmup and drop-rate scheduling.

    Trains on train_queue, then evaluates on valid_queue and test_queue each
    epoch with drop rates zeroed. Returns the (trained) model.
    """
    # unwrap DDP to reach model-specific helpers
    ori_model = model.module if args.distributed else model
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)
    drop_layers = ori_model.drop_layers()
    criterion = get_criterion(args.classes, args.label_smoothing)
    for epoch in range(args.epochs):
        # NOTE(review): scheduler.step() before training mirrors pre-1.1
        # PyTorch semantics — confirm this ordering is intentional here.
        scheduler.step()
        if args.warmup and epoch < args.warmup_epochs:
            # linear warmup from warmup_lr toward learning_rate
            lr = args.learning_rate * epoch / args.warmup_epochs + args.warmup_lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            cond_logging('epoch %d lr %e', epoch, lr)
        else:
            lr = scheduler.get_lr()[0]
            cond_logging('epoch %d lr %e', epoch, lr)
        if args.distributed:
            train_queue.sampler.set_epoch(epoch)
        # optionally scale the drop rate with epoch (epd) and depth (layerd)
        if args.epd:
            drop_rate = args.drop_rate * epoch / args.epochs
        else:
            drop_rate = args.drop_rate
        drop_rates = [drop_rate] * drop_layers
        if args.layerd:
            # deeper layers get proportionally larger drop rates
            for i in range(drop_layers):
                drop_rates[i] = drop_rates[i] * (i + 1) / drop_layers
        ori_model.set_drop_rates(drop_rates)
        cond_logging('drop rates:')
        cond_logging(ori_model.drop_rates)
        #training
        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     lr, args.report_freq, args.world_size,
                                     args.distributed, args.local_rank)
        cond_logging('train acc %f', train_acc)
        #validation
        # disable dropping for evaluation
        drop_rates = [0] * drop_layers
        ori_model.set_drop_rates(drop_rates)
        valid_acc, valid_obj = infer(valid_queue, model, criterion,
                                     args.report_freq, args.world_size,
                                     args.distributed, args.local_rank)
        cond_logging('valid acc %f', valid_acc)
        test_acc, test_obj = infer(test_queue, model, criterion,
                                   args.report_freq, args.world_size,
                                   args.distributed, args.local_rank)
        cond_logging('test acc %f', test_acc)
    return model
def __init__(self, opt):
    """FFA image-restoration model: network, optimizer, scheduler, meters."""
    super(Model, self).__init__()
    self.opt = opt
    # move the cleaner network onto the configured device
    network = FFA().to(device=opt.device)
    self.cleaner = network
    print_network(self.cleaner)
    self.g_optimizer = get_optimizer(opt, self.cleaner)
    self.scheduler = get_scheduler(opt, self.g_optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, opt, logger=None):
    """Detection model built from get_net().

    :param opt: options (.device, .debug, .tag, optimizer/scheduler settings)
    :param logger: optional logger (currently unused)
    """
    # BUG FIX: the original passed undefined `config`/`kwargs` to the base
    # constructor; the other opt-based models call it with no arguments.
    super(Model, self).__init__()
    self.opt = opt
    self.detector = get_net().to(device=opt.device)
    if opt.debug:
        print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join('checkpoints', opt.tag)
def __init__(self, opt):
    """Faster R-CNN (ResNet-50 FPN) detection model wrapper."""
    super(Model, self).__init__()
    self.opt = opt
    # torchvision detector, trained from scratch (no pretrained weights)
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        pretrained=False)
    # swap the stock box head for one sized to our classes (+1 background)
    head_in = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(
        head_in, opt.num_classes + 1)
    self.detector = detector
    print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, config, **kwargs):
    """Darknet (Yolo2/Yolo3) detection model (config-driven, DDP-aware).

    :param config: experiment config (MODEL.NAME, MODEL.LOAD, DATA, ...)
    """
    super(Model, self).__init__(config, kwargs)
    self.config = config
    # Pick the Darknet cfg file for the requested Yolo variant
    if config.MODEL.NAME == 'Yolo2':
        cfgfile = 'configs/networks/yolo2-voc.cfg'
    elif config.MODEL.NAME == 'Yolo3':
        cfgfile = 'configs/networks/yolo3-coco.cfg'
    else:
        # BUG FIX: previously any other model name fell through and the next
        # line raised a confusing NameError on `cfgfile`.
        raise NotImplementedError(f'no such model: {config.MODEL.NAME}')
    # NOTE(review): `opt` appears to be a module-level global — confirm.
    self.detector = Darknet(cfgfile, device=opt.device).to(opt.device)
    if opt.debug:
        print_network(self.detector)
    # Optionally load Darknet-format weights before the --load checkpoint path
    if opt.load and opt.load[-2:] != 'pt':
        if is_first_gpu():
            utils.color_print('Load Yolo weights from %s.' % opt.load, 3)
        self.detector.load_weights(opt.load)
    elif 'LOAD' in config.MODEL and config.MODEL.LOAD[-2:] != 'pt':
        if is_first_gpu():
            utils.color_print(
                'Load Yolo weights from %s.' % config.MODEL.LOAD, 3)
        self.detector.load_weights(config.MODEL.LOAD)
    self.to(opt.device)
    # Multi-GPU: convert BN to SyncBN and wrap with DDP
    if is_distributed():
        self.detector = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
            self.detector)
        self.detector = torch.nn.parallel.DistributedDataParallel(
            self.detector,
            find_unused_parameters=False,
            device_ids=[opt.local_rank],
            output_device=opt.local_rank)
    self.optimizer = get_optimizer(config, self.detector)
    self.scheduler = get_scheduler(config, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join('checkpoints', opt.tag)
def __init__(self, opt):
    """YOLOv5-x detection model wrapper."""
    super(Model, self).__init__()
    self.opt = opt
    # build the network from the v5x yaml definition
    net = Yolo5('configs/yolov5x.yaml')
    # NOTE(review): `hyp` looks like a module-level hyper-parameter dict;
    # `gr` is presumably the giou loss ratio (yolov5 convention) — confirm.
    net.hyp = hyp
    net.gr = 1.0
    net.nc = opt.num_classes
    self.detector = net
    print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
    # iteration counter
    self.it = 0
def exec_train(config, train_data_loader, valid_data_loader, OUTPUT_DIR, fold,
               trained_epoch=0):
    """Run the training loop for one fold, resuming after *trained_epoch*."""
    # load model and make parallel
    device = torch.device('cuda:0')
    model = Model(config['model']).to(device)

    # train setting: only parameters that require gradients are optimized
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(config['train']['optimizer'], params)
    scheduler = get_scheduler(config['train']['scheduler'], optimizer)

    # log setting (fold is reported 1-based)
    logger = Logger(model, optimizer, output_dir=OUTPUT_DIR, run_name=RUN_NAME,
                    trained_epoch=trained_epoch, config=config, fold=fold + 1)

    # training
    total_epochs = config['train']['epochs']
    kfold = config['general']['kfold']
    for epoch in range(trained_epoch + 1, total_epochs + 1):
        if kfold < 0:
            print("\r [Epoch %d]" % epoch)
        else:
            print("\r [Fold %d : Epoch %d]" % (fold + 1, epoch))
        train_epoch(model, train_data_loader, logger, optimizer)
        evaluate_epoch(model, valid_data_loader, logger, optimizer)
        if scheduler is not None:
            # plateau-style scheduler keyed on the last validation loss
            scheduler.step(logger.last_valid_loss)
    logger.finish_training()
def __init__(self, opt, logger=None):
    """YOLOv5-x detection model wrapper (opt-based).

    :param opt: options (.debug, .tag, optimizer/scheduler settings)
    :param logger: optional logger (currently unused)
    """
    # BUG FIX: the original passed undefined `kwargs` (and `config`) to the
    # base constructor; opt-based siblings call it with no arguments.
    super(Model, self).__init__()
    self.opt = opt
    cfgfile = 'configs/yolov5x.yaml'
    self.detector = Yolo5(cfgfile)
    # NOTE(review): `hyp` and `config` look like module-level globals — confirm.
    self.detector.hyp = hyp
    # gr: presumably the giou loss ratio (yolov5 convention) — confirm
    self.detector.gr = 1.0
    self.detector.nc = config.DATA.NUM_CLASSESS
    if opt.debug:
        print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join('checkpoints', opt.tag)
    # iteration counter
    self.it = 0
def __init__(self, opt):
    """Classifier model with a hand-configured Adam optimizer."""
    super(Model, self).__init__()
    self.opt = opt
    self.classifier = Classifier()
    print_network(self.classifier)
    # Adam with a non-default beta1 of 0.95
    self.optimizer = optim.Adam(self.classifier.parameters(),
                                lr=opt.lr,
                                betas=(0.95, 0.999))
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, opt, logger=None):
    """RetinaNet (ResNet-50) detection model wrapper."""
    super(Model, self).__init__()
    self.opt = opt
    # pretrained backbone, heads sized for opt.num_classes
    self.detector = Retina_50(opt.num_classes, pretrained=True)
    print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, opt, logger=None):
    """SSD detection model plus the matching ground-truth target transform."""
    super(Model, self).__init__()
    self.opt = opt
    self.detector = SSDDetector(opt).to(device=opt.device)
    print_network(self.detector)
    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
    # encode ground-truth boxes against the prior (anchor) boxes
    center_variance, size_variance, match_threshold = 0.1, 0.2, 0.5
    self.target_transform = SSDTargetTransform(PriorBox(opt)(),
                                               center_variance,
                                               size_variance,
                                               match_threshold)
def __init__(self, opt):
    """Classification model: network, optimizer, scheduler and meters."""
    super(Model, self).__init__()
    self.opt = opt
    self.classifier = Classifier()
    print_network(self.classifier)
    self.optimizer = get_optimizer(opt, self.classifier)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def __init__(self, config, **kwargs):
    """SSD detection model (config-driven variant)."""
    super(Model, self).__init__(config, kwargs)
    self.config = config
    # NOTE(review): `opt` appears to be a module-level global here — confirm.
    self.detector = SSDDetector(config).to(device=opt.device)
    if opt.debug:
        print_network(self.detector)
    self.optimizer = get_optimizer(config, self.detector)
    self.scheduler = get_scheduler(config, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join('checkpoints', opt.tag)
    # encode ground-truth boxes against the prior (anchor) boxes
    center_variance, size_variance, match_threshold = 0.1, 0.2, 0.5
    self.target_transform = SSDTargetTransform(PriorBox(config)(),
                                               center_variance,
                                               size_variance,
                                               match_threshold)
def __init__(self, opt):
    """Classifier with a direct-feature backbone and a meta-embedding head."""
    super(Model, self).__init__()
    self.opt = opt
    self.direct_feature = DirectFeature(opt.model)
    self.feature_nums = self.direct_feature.get_feature_num()
    # 50030: class count of the meta-embedding head — presumably dataset
    # specific; confirm.
    self.meta_embedding = MetaEmbedding(self.feature_nums, 50030)
    print_network(self.direct_feature)
    print_network(self.meta_embedding)
    # TODO: consider separate lrs (direct_feature 0.01, meta_embedding 0.1)
    joint_params = chain(self.direct_feature.parameters(),
                         self.meta_embedding.parameters())
    self.optimizer = optim.Adam(joint_params, lr=0.01)
    self.scheduler = get_scheduler(opt, self.optimizer)
    # smoothed bookkeeping of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
    # unweighted cross-entropy loss
    self.criterionCE = nn.CrossEntropyLoss()
def initialize(self):
    """Store the global scheduler on this object's shared config."""
    self.config.sched = scheduler.get_scheduler()
def timedio(self, function, args=(), kwargs=None, timeout=30):
    """Call a method with a failsafe timeout value.

    :param function: callable to invoke under the I/O timeout guard
    :param args: positional arguments for *function*
    :param kwargs: keyword arguments for *function* (fresh dict if omitted)
    :param timeout: seconds allowed before the scheduler aborts the call
    :returns: whatever the scheduler's iotimeout returns
    """
    # BUG FIX: the original default `kwargs={}` is a mutable default shared
    # across all calls; use the None-sentinel idiom instead (backward
    # compatible — callers passing a dict are unaffected).
    if kwargs is None:
        kwargs = {}
    sched = scheduler.get_scheduler()
    return sched.iotimeout(function, args, kwargs, timeout)
def main():
    """Distributed ImageNet-style training entry point.

    Parses args/config, builds a (possibly FP16, sync-BN grouped) model,
    optimizer and LR scheduler, optionally resumes a checkpoint, then either
    evaluates or trains.
    """
    global args, config, best_prec1

    args = parser.parse_args()
    with open(args.config) as f:
        config = yaml.load(f)
    config = EasyDict(config['common'])
    config.save_path = os.path.dirname(args.config)

    rank, world_size = dist_init()

    # create model
    bn_group_size = config.model.kwargs.bn_group_size
    bn_var_mode = config.model.kwargs.get('bn_var_mode', 'L2')
    if bn_group_size == 1:
        bn_group = None
    else:
        # group size must evenly divide the world size
        assert world_size % bn_group_size == 0
        bn_group = simple_group_split(world_size, rank,
                                      world_size // bn_group_size)
    config.model.kwargs.bn_group = bn_group
    config.model.kwargs.bn_var_mode = (link.syncbnVarMode_t.L1
                                       if bn_var_mode == 'L1' else
                                       link.syncbnVarMode_t.L2)
    model = model_entry(config.model)
    if rank == 0:
        print(model)
    model.cuda()

    # FP16 training is implied by these optimizer types
    if config.optimizer.type == 'FP16SGD' or config.optimizer.type == 'FusedFP16SGD':
        args.fp16 = True
    else:
        args.fp16 = False

    if args.fp16:
        # if you have modules that must use fp32 parameters, and need fp32 input
        # try use link.fp16.register_float_module(your_module)
        # if you only need fp32 parameters set cast_args=False when call this
        # function, then call link.fp16.init() before call model.half()
        if config.optimizer.get('fp16_normal_bn', False):
            print('using normal bn for fp16')
            link.fp16.register_float_module(link.nn.SyncBatchNorm2d,
                                            cast_args=False)
            link.fp16.register_float_module(torch.nn.BatchNorm2d,
                                            cast_args=False)
            link.fp16.init()
        model.half()

    model = DistModule(model, args.sync)

    # create optimizer
    opt_config = config.optimizer
    opt_config.kwargs.lr = config.lr_scheduler.base_lr
    if config.get('no_wd', False):
        # exclude some parameter groups from weight decay
        param_group, type2num = param_group_no_wd(model)
        opt_config.kwargs.params = param_group
    else:
        opt_config.kwargs.params = model.parameters()
    optimizer = optim_entry(opt_config)

    # optionally resume from a checkpoint
    last_iter = -1
    best_prec1 = 0
    if args.load_path:
        if args.recover:
            best_prec1, last_iter = load_state(args.load_path, model,
                                               optimizer=optimizer)
        else:
            load_state(args.load_path, model)

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # augmentation
    aug = [
        transforms.RandomResizedCrop(config.augmentation.input_size),
        transforms.RandomHorizontalFlip()
    ]

    # reject unknown augmentation keys early
    for k in config.augmentation.keys():
        assert k in [
            'input_size', 'test_resize', 'rotation', 'colorjitter', 'colorold'
        ]
    rotation = config.augmentation.get('rotation', 0)
    colorjitter = config.augmentation.get('colorjitter', None)
    colorold = config.augmentation.get('colorold', False)

    if rotation > 0:
        aug.append(transforms.RandomRotation(rotation))
    if colorjitter is not None:
        aug.append(transforms.ColorJitter(*colorjitter))
    aug.append(transforms.ToTensor())
    if colorold:
        aug.append(ColorAugmentation())
    aug.append(normalize)

    # train
    train_dataset = McDataset(config.train_root, config.train_source,
                              transforms.Compose(aug), fake=args.fake)

    # val
    val_dataset = McDataset(
        config.val_root, config.val_source,
        transforms.Compose([
            transforms.Resize(config.augmentation.test_resize),
            transforms.CenterCrop(config.augmentation.input_size),
            transforms.ToTensor(),
            normalize,
        ]), args.fake)

    train_sampler = DistributedGivenIterationSampler(
        train_dataset, config.lr_scheduler.max_iter, config.batch_size,
        last_iter=last_iter)
    val_sampler = DistributedSampler(val_dataset, round_up=False)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=False,
                              num_workers=config.workers,
                              pin_memory=True,
                              sampler=train_sampler)

    val_loader = DataLoader(val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=config.workers,
                            pin_memory=True,
                            sampler=val_sampler)

    # FP16SGD wraps the real optimizer; the scheduler needs the inner one
    config.lr_scheduler['optimizer'] = optimizer.optimizer if isinstance(
        optimizer, FP16SGD) else optimizer
    config.lr_scheduler['last_iter'] = last_iter
    lr_scheduler = get_scheduler(config.lr_scheduler)

    if rank == 0:
        tb_logger = SummaryWriter(config.save_path + '/events')
        logger = create_logger('global_logger', config.save_path + '/log.txt')
        logger.info('args: {}'.format(pprint.pformat(args)))
        logger.info('config: {}'.format(pprint.pformat(config)))
    else:
        tb_logger = None

    if args.evaluate:
        if args.fusion_list is not None:
            validate(val_loader, model,
                     fusion_list=args.fusion_list,
                     fuse_prob=args.fuse_prob)
        else:
            validate(val_loader, model)
        link.finalize()
        return

    train(train_loader, val_loader, model, optimizer, lr_scheduler,
          last_iter + 1, tb_logger)

    link.finalize()
def load_config(self, config, cassandra_section): """Load the configuration of the section describe cassandra system Typically, named after [cassandra] """ assert isinstance(config, ConfigParser.ConfigParser) assert cassandra_section in config.sections() options = config.options(cassandra_section) for option in options: [has_stg_name, stg_name] = self._if_has_stg_name(option) if has_stg_name: if 'num_workers' in option: self.stg_num_workers_dict[stg_name] = config.getint( cassandra_section, option) elif 'scheduler' in option: print 'customerized scheduler:' + option stg_scheduler_section = config.get(cassandra_section, option) self.stg_scheduler_generator_dict[stg_name] = \ lambda env: get_scheduler(env, config, stg_scheduler_section) elif 'schedule_resource' in option: self.stg_schedule_resource_dict[stg_name] = config.get( cassandra_section, option) elif 'type_name' in option: self.stg_type_data_dict[stg_name] = { self.type_name_kw: config.get(cassandra_section, option) } else: print 'Warning: not handle stage config:' + option elif option == 'common_stg_scheduler': common_stg_scheduler_section = config.get( cassandra_section, option) self.stg_scheduler_generator_dict[self.common_stg_name] = \ lambda env: get_scheduler(env, config, common_stg_scheduler_section) self._init_stg_scheduler_dict() elif option == 'common_stg_schedule_resource': self.stg_schedule_resource_dict[ self.common_stg_name] = config.get(cassandra_section, option) self._init_stg_scheduler_dict() elif option == 'common_stg_num_workers': self.stg_num_workers_dict[ self.common_stg_name] = config.getint( cassandra_section, option) elif option == 'node_token_sum': self.node_token_sum = config.getint(cassandra_section, option) elif option == 'pnode_type': self.pnode_type = config.get(cassandra_section, option) elif option == 'replication_strategy': self.replication_strategy = config.get(cassandra_section, option) elif option == 'token_allocation_strategy': self.token_allocation_strategy = config.get( 
cassandra_section, option) elif option == 'stage_monitor_interval': self.stage_monitor_interval = config.getfloat( cassandra_section, option) elif option == 'unified_scheduler': unified_scheduler_section = config.get(cassandra_section, option) self.unified_scheduler_generator = lambda env: get_scheduler( env, config, unified_scheduler_section) else: raise ConfigError('Unknown option for cassandra:' + option) # build config for special stages, at this point, all the name has been read self._build_special_stage_param(config, cassandra_section)
def __init__(self, config, **kwargs):
    """Faster R-CNN detector (config-driven, DDP-aware).

    :param config: experiment config (DATA.SCALE, DATA.NUM_CLASSESS,
        TEST.NMS_THRESH, MODEL.BACKBONE, ...)
    """
    super(Model, self).__init__(config, kwargs)
    self.config = config
    kargs = {}
    if 'SCALE' in config.DATA:
        scale = config.DATA.SCALE
        if isinstance(scale, int):
            min_size = scale
            # derive the max resize bound from min with a fixed 5:3 ratio
            max_size = int(min_size / 3 * 5)
        else:
            min_size, max_size = config.DATA.SCALE
        kargs = {
            'min_size': min_size,
            'max_size': max_size,
        }
    kargs.update({'box_nms_thresh': config.TEST.NMS_THRESH})
    # use SyncBN when running multi-GPU
    if is_distributed():
        kargs.update({'norm_layer': torch.nn.SyncBatchNorm})
    # choose the backbone and build the Faster R-CNN model
    if config.MODEL.BACKBONE is None or config.MODEL.BACKBONE.lower() in [
            'res50', 'resnet50'
    ]:
        # default: ResNet-50 with FPN
        self.detector = fasterrcnn_resnet50_fpn(pretrained=False, **kargs)
        in_features = self.detector.roi_heads.box_predictor.cls_score.in_features
        # replace the pre-trained head with a new one (+1 for background)
        self.detector.roi_heads.box_predictor = FastRCNNPredictor(
            in_features, config.DATA.NUM_CLASSESS + 1)
    elif config.MODEL.BACKBONE.lower() in ['vgg16', 'vgg']:
        backbone = vgg16_backbone()
        self.detector = FasterRCNN(backbone,
                                   num_classes=config.DATA.NUM_CLASSESS + 1,
                                   **kargs)
    elif config.MODEL.BACKBONE.lower() in ['res101', 'resnet101']:
        # ResNet-101 without FPN
        backbone = res101_backbone()
        self.detector = FasterRCNN(backbone,
                                   num_classes=config.DATA.NUM_CLASSESS + 1,
                                   **kargs)
    elif config.MODEL.BACKBONE.lower() in ['res', 'resnet']:
        raise RuntimeError(
            f'backbone "{config.MODEL.BACKBONE}" is ambiguous, please specify layers.'
        )
    else:
        raise NotImplementedError(
            f'no such backbone: {config.MODEL.BACKBONE}')
    # NOTE(review): `opt` appears to be a module-level global — confirm.
    if opt.debug and is_first_gpu():
        print_network(self.detector)
    self.to(opt.device)
    # multi-GPU support
    if is_distributed():
        self.detector = torch.nn.parallel.DistributedDataParallel(
            self.detector,
            find_unused_parameters=False,
            device_ids=[opt.local_rank],
            output_device=opt.local_rank)
    self.optimizer = get_optimizer(config, self.detector)
    self.scheduler = get_scheduler(config, self.optimizer)
    # exponential moving average of training metrics
    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join('checkpoints', opt.tag)
#!/usr/bin/python import sys import time import datetime import utils import logger import config log = logger.get_logger(__name__) conf = config.get_config() import scheduler schedule = scheduler.get_scheduler() import slack import smtp import audio # run all the input services def run(): # schedule module summary report for module in conf["modules"]: if not module["enabled"]: continue if "daily_digest" not in module: continue if module["daily_digest"]: schedule.add_job(smtp.module_digest,'cron',hour="23",minute="55",second=utils.randint(1,59),args=[module["module_id"]]) log.info("["+module['module_id']+"] scheduling daily module digest") # schedule alert summary report if conf["output"]["email"]["alerts_digest"]: log.info("scheduling daily alert digest") schedule.add_job(smtp.alerts_digest,'cron',hour="0",minute="55",args=[]) # run slack bot if conf["input"]["slack"]["enabled"]: schedule.add_job(slack.run,'date',run_date=datetime.datetime.now())
def main(local_rank, args):
    """Per-process entry point for distributed AVA training/evaluation.

    Initializes the process group, builds model/data/optimizer from the yaml
    config, optionally resumes a checkpoint, then runs either evaluation or
    the train/val loop. Only rank 0 logs and writes tensorboard/json output.
    """
    # dist init
    rank, world_size = init_distributed(local_rank, args)
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    opt = EasyDict(config)
    opt.world_size = world_size

    if rank == 0:
        mkdir(opt.result_path)
        mkdir(os.path.join(opt.result_path, 'tmp'))
        with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
            json.dump(vars(opt), opt_file, indent=2)
        logger = create_logger(os.path.join(opt.result_path, 'log.txt'))
        logger.info('opt: {}'.format(pprint.pformat(opt, indent=2)))
        writer = SummaryWriter(os.path.join(opt.result_path, 'tb'))
    else:
        logger = writer = None
    # wait until rank 0 has created the output directories
    dist.barrier()

    random_seed(opt.manual_seed)
    # setting benchmark to True causes OOM in some cases
    if opt.get('cudnn', None) is not None:
        torch.backends.cudnn.deterministic = opt.cudnn.get(
            'deterministic', False)
        torch.backends.cudnn.benchmark = opt.cudnn.get('benchmark', False)

    # create model
    net = AVA_model(opt.model)
    net.cuda()
    net = DistributedDataParallel(net,
                                  device_ids=[local_rank],
                                  broadcast_buffers=False)
    if rank == 0:
        logger.info(net)
        logger.info(parameters_string(net))

    if not opt.get('evaluate', False):
        # build train-time augmentation, data and optimizer
        train_aug = opt.train.augmentation

        spatial_transform = [
            getattr(spatial_transforms, aug.type)(**aug.get('kwargs', {}))
            for aug in train_aug.spatial
        ]
        spatial_transform = spatial_transforms.Compose(spatial_transform)

        temporal_transform = getattr(
            temporal_transforms,
            train_aug.temporal.type)(**train_aug.temporal.get('kwargs', {}))

        train_data = ava.AVA(opt.train.root_path, opt.train.annotation_path,
                             spatial_transform, temporal_transform)

        train_sampler = DistributedSampler(train_data, round_down=True)
        train_loader = ava.AVADataLoader(train_data,
                                         batch_size=opt.train.batch_size,
                                         shuffle=False,
                                         num_workers=opt.train.get(
                                             'workers', 1),
                                         pin_memory=True,
                                         sampler=train_sampler,
                                         drop_last=True)

        if rank == 0:
            logger.info('# train data: {}'.format(len(train_data)))
            logger.info('train spatial aug: {}'.format(spatial_transform))
            logger.info('train temporal aug: {}'.format(temporal_transform))

            train_logger = Logger(os.path.join(opt.result_path, 'train.log'),
                                  ['epoch', 'loss', 'lr'])
            train_batch_logger = Logger(
                os.path.join(opt.result_path, 'train_batch.log'),
                ['epoch', 'batch', 'iter', 'loss', 'lr'])
        else:
            train_logger = train_batch_logger = None

        optim_opt = opt.train.optimizer
        sched_opt = opt.train.scheduler

        optimizer = getattr(optim, optim_opt.type)(net.parameters(),
                                                   lr=sched_opt.base_lr,
                                                   **optim_opt.kwargs)
        scheduler = get_scheduler(sched_opt, optimizer, opt.train.n_epochs,
                                  len(train_loader))

    # build the multi-crop validation pipeline: the cartesian product of all
    # per-transform kwargs choices yields one Compose per crop variant
    val_aug = opt.val.augmentation

    transform_choices, total_choices = [], 1
    for aug in val_aug.spatial:
        kwargs_list = aug.get('kwargs', {})
        if not isinstance(kwargs_list, list):
            kwargs_list = [kwargs_list]
        cur_choices = [
            getattr(spatial_transforms, aug.type)(**kwargs)
            for kwargs in kwargs_list
        ]
        transform_choices.append(cur_choices)
        total_choices *= len(cur_choices)

    spatial_transform = []
    for choice_idx in range(total_choices):
        # decode choice_idx as mixed-radix digits, one per transform slot
        idx, transform = choice_idx, []
        for cur_choices in transform_choices:
            n_choices = len(cur_choices)
            cur_idx = idx % n_choices
            transform.append(cur_choices[cur_idx])
            idx = idx // n_choices
        spatial_transform.append(spatial_transforms.Compose(transform))

    temporal_transform = getattr(
        temporal_transforms,
        val_aug.temporal.type)(**val_aug.temporal.get('kwargs', {}))

    val_data = ava.AVAmulticrop(opt.val.root_path, opt.val.annotation_path,
                                spatial_transform, temporal_transform)

    val_sampler = DistributedSampler(val_data, round_down=False)
    val_loader = ava.AVAmulticropDataLoader(val_data,
                                            batch_size=opt.val.batch_size,
                                            shuffle=False,
                                            num_workers=opt.val.get(
                                                'workers', 1),
                                            pin_memory=True,
                                            sampler=val_sampler)

    val_logger = None
    if rank == 0:
        logger.info('# val data: {}'.format(len(val_data)))
        logger.info('val spatial aug: {}'.format(spatial_transform))
        logger.info('val temporal aug: {}'.format(temporal_transform))

        val_log_items = ['epoch']
        if opt.val.with_label:
            val_log_items.append('loss')
        if opt.val.get('eval_mAP', None) is not None:
            val_log_items.append('mAP')
        if len(val_log_items) > 1:
            val_logger = Logger(os.path.join(opt.result_path, 'val.log'),
                                val_log_items)

    if opt.get('pretrain', None) is not None:
        load_pretrain(opt.pretrain, net)

    begin_epoch = 1
    if opt.get('resume_path', None) is not None:
        # allow resume paths relative to the result directory
        if not os.path.isfile(opt.resume_path):
            opt.resume_path = os.path.join(opt.result_path, opt.resume_path)
        checkpoint = torch.load(
            opt.resume_path, map_location=lambda storage, loc: storage.cuda())

        begin_epoch = checkpoint['epoch'] + 1
        net.load_state_dict(checkpoint['state_dict'])
        if rank == 0:
            logger.info('Resumed from checkpoint {}'.format(opt.resume_path))

        if not opt.get('evaluate', False):
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            if rank == 0:
                logger.info(
                    'Also loaded optimizer and scheduler from checkpoint {}'.
                    format(opt.resume_path))

    criterion, act_func = getattr(losses,
                                  opt.loss.type)(**opt.loss.get('kwargs', {}))

    if opt.get('evaluate', False):  # evaluation mode
        val_epoch(begin_epoch - 1, val_loader, net, criterion, act_func, opt,
                  logger, val_logger, rank, world_size, writer)
    else:  # training and validation mode
        for e in range(begin_epoch, opt.train.n_epochs + 1):
            train_sampler.set_epoch(e)
            train_epoch(e, train_loader, net, criterion, optimizer, scheduler,
                        opt, logger, train_logger, train_batch_logger, rank,
                        world_size, writer)

            if e % opt.train.val_freq == 0:
                val_epoch(e, val_loader, net, criterion, act_func, opt,
                          logger, val_logger, rank, world_size, writer)

    if rank == 0:
        writer.close()
def initialize(self):
    """Capture the platform state needed later: RTC handle, scheduler,
    and a baseline interrupt snapshot, all stored on self.config."""
    cfg = self.config
    cfg.rtc = Linux.rtc.RTC()
    cfg.sched = scheduler.get_scheduler()
    # Snapshot /proc interrupt counters once so later readings can be
    # compared against this baseline.
    irq_table = proc.interrupts.get_interrupts()
    cfg.interrupts = irq_table
    # Baseline count for IRQ 8, first CPU column.
    # NOTE(review): IRQ 8 is conventionally the RTC interrupt on x86 — confirm
    # against the proc.interrupts layout.
    cfg.irqstart = irq_table[8][0].count
def __init__(self, opt, logger=None):
    """Build a Cascade R-CNN detector with the backbone chosen by opt.backbone.

    Args:
        opt: options namespace; fields read here: scale, backbone,
            num_classes, debug-independent optimizer/scheduler settings,
            checkpoint_dir, tag.
        logger: unused here; kept for interface parity with sibling models.

    Raises:
        RuntimeError: if the backbone name is ambiguous (e.g. just "resnet").
        NotImplementedError: for an unknown backbone name.
    """
    super(Model, self).__init__()
    self.opt = opt

    # Input resize range: keep torchvision's 800/1333 defaults unless an
    # explicit scale is given, in which case preserve the same 3:5 ratio.
    if opt.scale:
        min_size = opt.scale
        max_size = int(min_size / 3 * 5)
    else:
        min_size = 800
        max_size = 1333

    kargs = {
        'min_size': min_size,
        'max_size': max_size,
        # One IoU threshold per cascade stage.
        'cascade_iou_thr': [0.5, 0.6, 0.7],
    }

    backbone_name = None if opt.backbone is None else opt.backbone.lower()

    # Select backbone and build the detector.
    if backbone_name is None or backbone_name in ['res50', 'resnet50']:
        # Default: resnet50 with FPN.
        self.detector = cascadercnn_resnet50_fpn(pretrained=False, **kargs)
        in_features = self.detector.roi_heads[
            0].box_predictor.cls_score.in_features
        # Replace the pre-trained head of every cascade stage with a fresh
        # predictor sized for our classes (+1 for background).
        for stage in range(3):
            self.detector.roi_heads[stage].box_predictor = FastRCNNPredictor(
                in_features, opt.num_classes + 1)
    elif backbone_name in ['vgg16', 'vgg']:
        backbone = vgg16_backbone()
        self.detector = CascadeRCNN(backbone,
                                    num_classes=opt.num_classes + 1,
                                    **kargs)
    elif backbone_name in ['res101', 'resnet101']:
        # resnet101 without FPN.
        backbone = res101_backbone()
        self.detector = CascadeRCNN(backbone,
                                    num_classes=opt.num_classes + 1,
                                    **kargs)
    elif backbone_name in ['res', 'resnet']:
        raise RuntimeError(
            f'backbone "{opt.backbone}" is ambiguous, please specify layers.'
        )
    else:
        raise NotImplementedError(f'no such backbone: {opt.backbone}')

    print_network(self.detector)

    self.optimizer = get_optimizer(opt, self.detector)
    self.scheduler = get_scheduler(opt, self.optimizer)

    self.avg_meters = ExponentialMovingAverage(0.95)
    self.save_dir = os.path.join(opt.checkpoint_dir, opt.tag)
def pause_job(job_id):
    """Push back the given job's next run by one hour.

    NOTE: despite the name, this does not pause the job indefinitely — it
    delays next_run_time by exactly one hour.

    Args:
        job_id: identifier of a job registered with the scheduler.

    Returns:
        True on success.
    """
    scheduler = get_scheduler()
    job = scheduler.get_job(job_id)
    new_time = job.next_run_time + datetime.timedelta(hours=1)
    # Use the public modify_job API rather than the private job._modify():
    # modify_job notifies the scheduler so it recomputes its wakeup time,
    # whereas _modify alone can leave the scheduler sleeping past the change.
    scheduler.modify_job(job_id, next_run_time=new_time)
    return True
def __init__(self, config_path, run_dir):
    """Set up a complete training run from a YAML config file.

    Builds, in order: run-dir logging, device selection, train/val datasets
    and loaders, the model, optimizer and LR scheduler, checkpoint
    loading (pretrained or resume), metric/score TSV files, output
    directories for prototypes / variances / transformation previews, and
    an optional Visdom visualizer.

    Args:
        config_path: path to a YAML config with "dataset", "model" and
            "training" sections; it is copied into the run directory.
        run_dir: directory where all outputs are written (created if needed).
    """
    self.config_path = coerce_to_path_and_check_exist(config_path)
    self.run_dir = coerce_to_path_and_create_dir(run_dir)
    self.logger = get_logger(self.run_dir, name="trainer")
    self.print_and_log_info(
        "Trainer initialisation: run directory is {}".format(run_dir))

    # Keep a copy of the config next to the results for reproducibility.
    shutil.copy(self.config_path, self.run_dir)
    self.print_and_log_info("Config {} copied to run directory".format(
        self.config_path))

    with open(self.config_path) as fp:
        cfg = yaml.load(fp, Loader=yaml.FullLoader)

    # Device selection; nb_device is informational (logged only).
    if torch.cuda.is_available():
        type_device = "cuda"
        nb_device = torch.cuda.device_count()
    else:
        type_device = "cpu"
        nb_device = None
    self.device = torch.device(type_device)
    self.print_and_log_info("Using {} device, nb_device is {}".format(
        type_device, nb_device))

    # Datasets and dataloaders.
    # NOTE: pop("name") mutates dataset_kwargs so the remainder can be
    # splatted straight into the dataset constructor.
    self.dataset_kwargs = cfg["dataset"]
    self.dataset_name = self.dataset_kwargs.pop("name")
    train_dataset = get_dataset(self.dataset_name)("train",
                                                   **self.dataset_kwargs)
    val_dataset = get_dataset(self.dataset_name)("val", **self.dataset_kwargs)
    self.n_classes = train_dataset.n_classes
    self.is_val_empty = len(val_dataset) == 0
    self.print_and_log_info("Dataset {} instantiated with {}".format(
        self.dataset_name, self.dataset_kwargs))
    self.print_and_log_info(
        "Found {} classes, {} train samples, {} val samples".format(
            self.n_classes, len(train_dataset), len(val_dataset)))
    self.img_size = train_dataset.img_size
    self.batch_size = cfg["training"]["batch_size"]
    self.n_workers = cfg["training"].get("n_workers", 4)
    self.train_loader = DataLoader(train_dataset,
                                   batch_size=self.batch_size,
                                   num_workers=self.n_workers,
                                   shuffle=True)
    self.val_loader = DataLoader(val_dataset,
                                 batch_size=self.batch_size,
                                 num_workers=self.n_workers)
    self.print_and_log_info(
        "Dataloaders instantiated with batch_size={} and n_workers={}".
        format(self.batch_size, self.n_workers))

    # Training length: exactly one of n_iterations / n_epoches may be set
    # in the config (the assert forbids both); the other is derived.
    self.n_batches = len(self.train_loader)
    self.n_iterations, self.n_epoches = cfg["training"].get(
        "n_iterations"), cfg["training"].get("n_epoches")
    assert not (self.n_iterations is not None
                and self.n_epoches is not None)
    if self.n_iterations is not None:
        self.n_epoches = max(self.n_iterations // self.n_batches, 1)
    else:
        self.n_iterations = self.n_epoches * len(self.train_loader)

    # Model
    self.model_kwargs = cfg["model"]
    self.model_name = self.model_kwargs.pop("name")
    # GMM variant is detected by name; it additionally gets a variances dir
    # later in this method.
    self.is_gmm = 'gmm' in self.model_name
    self.model = get_model(self.model_name)(
        self.train_loader.dataset, **self.model_kwargs).to(self.device)
    self.print_and_log_info("Using model {} with kwargs {}".format(
        self.model_name, self.model_kwargs))
    self.print_and_log_info('Number of trainable parameters: {}'.format(
        f'{count_parameters(self.model):,}'))
    self.n_prototypes = self.model.n_prototypes

    # Optimizer: cluster and transformer parameter groups can each carry
    # their own kwargs (e.g. per-group lr); the rest applies to both.
    opt_params = cfg["training"]["optimizer"] or {}
    optimizer_name = opt_params.pop("name")
    cluster_kwargs = opt_params.pop('cluster', {})
    tsf_kwargs = opt_params.pop('transformer', {})
    self.optimizer = get_optimizer(optimizer_name)([
        dict(params=self.model.cluster_parameters(), **cluster_kwargs),
        dict(params=self.model.transformer_parameters(), **tsf_kwargs)
    ], **opt_params)
    self.model.set_optimizer(self.optimizer)
    self.print_and_log_info("Using optimizer {} with kwargs {}".format(
        optimizer_name, opt_params))
    self.print_and_log_info("cluster kwargs {}".format(cluster_kwargs))
    self.print_and_log_info("transformer kwargs {}".format(tsf_kwargs))

    # Scheduler: can step per epoch or per batch (update_range). For
    # multi_step with float milestones, the floats are fractions of the
    # total epochs/iterations and are converted to absolute steps here.
    scheduler_params = cfg["training"].get("scheduler", {}) or {}
    scheduler_name = scheduler_params.pop("name", None)
    self.scheduler_update_range = scheduler_params.pop(
        "update_range", "epoch")
    assert self.scheduler_update_range in ["epoch", "batch"]
    if scheduler_name == "multi_step" and isinstance(
            scheduler_params["milestones"][0], float):
        n_tot = self.n_epoches if self.scheduler_update_range == "epoch" else self.n_iterations
        scheduler_params["milestones"] = [
            round(m * n_tot) for m in scheduler_params["milestones"]
        ]
    self.scheduler = get_scheduler(scheduler_name)(self.optimizer,
                                                   **scheduler_params)
    self.cur_lr = self.scheduler.get_last_lr()[0]
    self.print_and_log_info("Using scheduler {} with parameters {}".format(
        scheduler_name, scheduler_params))

    # Pretrained / Resume — mutually exclusive; "resume" also restores
    # training state (see load_from_tag with resume=True).
    checkpoint_path = cfg["training"].get("pretrained")
    checkpoint_path_resume = cfg["training"].get("resume")
    assert not (checkpoint_path is not None
                and checkpoint_path_resume is not None)
    if checkpoint_path is not None:
        self.load_from_tag(checkpoint_path)
    elif checkpoint_path_resume is not None:
        self.load_from_tag(checkpoint_path_resume, resume=True)
    else:
        self.start_epoch, self.start_batch = 1, 1

    # Train metrics & check_cluster interval: TSV file with one column per
    # metric, including a per-prototype cluster-proportion column.
    metric_names = ['time/img', 'loss']
    metric_names += [f'prop_clus{i}' for i in range(self.n_prototypes)]
    train_iter_interval = cfg["training"]["train_stat_interval"]
    self.train_stat_interval = train_iter_interval
    self.train_metrics = Metrics(*metric_names)
    self.train_metrics_path = self.run_dir / TRAIN_METRICS_FILE
    with open(self.train_metrics_path, mode="w") as f:
        f.write("iteration\tepoch\tbatch\t" +
                "\t".join(self.train_metrics.names) + "\n")
    self.check_cluster_interval = cfg["training"]["check_cluster_interval"]

    # Val metrics & scores (same TSV layout as the train metrics).
    val_iter_interval = cfg["training"]["val_stat_interval"]
    self.val_stat_interval = val_iter_interval
    self.val_metrics = Metrics('loss_val')
    self.val_metrics_path = self.run_dir / VAL_METRICS_FILE
    with open(self.val_metrics_path, mode="w") as f:
        f.write("iteration\tepoch\tbatch\t" +
                "\t".join(self.val_metrics.names) + "\n")
    self.val_scores = Scores(self.n_classes, self.n_prototypes)
    self.val_scores_path = self.run_dir / VAL_SCORES_FILE
    with open(self.val_scores_path, mode="w") as f:
        f.write("iteration\tepoch\tbatch\t" +
                "\t".join(self.val_scores.names) + "\n")

    # Prototypes & Variances: one output subdirectory per prototype (the
    # list comprehensions are used purely for the mkdir side effect).
    self.prototypes_path = coerce_to_path_and_create_dir(self.run_dir /
                                                         'prototypes')
    [
        coerce_to_path_and_create_dir(self.prototypes_path / f'proto{k}')
        for k in range(self.n_prototypes)
    ]
    if self.is_gmm:
        self.variances_path = coerce_to_path_and_create_dir(self.run_dir /
                                                            'variances')
        [
            coerce_to_path_and_create_dir(self.variances_path / f'var{k}')
            for k in range(self.n_prototypes)
        ]

    # Transformation predictions: cache the first few images of the first
    # train batch (element [0] of the batch — presumably (images, labels);
    # TODO confirm against the dataset) for periodic visualization.
    self.transformation_path = coerce_to_path_and_create_dir(
        self.run_dir / 'transformations')
    self.images_to_tsf = next(iter(
        self.train_loader))[0][:N_TRANSFORMATION_PREDICTIONS].to(
            self.device)
    for k in range(self.images_to_tsf.size(0)):
        out = coerce_to_path_and_create_dir(self.transformation_path /
                                            f'img{k}')
        convert_to_img(self.images_to_tsf[k]).save(out / 'input.png')
        [
            coerce_to_path_and_create_dir(out / f'tsf{k}')
            for k in range(self.n_prototypes)
        ]

    # Visdom (optional): only imported/connected if a port is configured.
    viz_port = cfg["training"].get("visualizer_port")
    if viz_port is not None:
        from visdom import Visdom
        os.environ["http_proxy"] = ""
        self.visualizer = Visdom(
            port=viz_port,
            env=f'{self.run_dir.parent.name}_{self.run_dir.name}')
        self.visualizer.delete_env(
            self.visualizer.env)  # Clean env before plotting
        self.print_and_log_info(f"Visualizer initialised at {viz_port}")
    else:
        self.visualizer = None
        self.print_and_log_info("No visualizer initialized")
def initialize(self):
    """Reset the stop flag and elapsed-timer counter, then acquire the
    scheduler instance."""
    # Counters first (both zeroed), then the scheduler handle.
    self.STOP = 0
    self._passed_timer = 0
    self.sched = scheduler.get_scheduler()