def __init__(self, args):
    self.batch_size = cfg.TRAIN.CONFIG.BATCH_SIZE
    self.gpu_num = cfg.TRAIN.CONFIG.GPU_NUM
    self.num_workers = cfg.DATA_LOADER.NUM_THREADS
    self.log_dir = cfg.MODEL.PATH.EVALUATION_DIR

    self.is_training = False
    self.cls_thresh = float(args.cls_threshold)
    self.eval_interval_secs = args.eval_interval_secs
    self.restore_model_path = args.restore_model_path
    self.eval = True

    # save dir
    datetime_str = str(datetime.datetime.now())
    self.log_dir = os.path.join(self.log_dir, self.restore_model_path, datetime_str)
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    self.log_file = open(os.path.join(self.log_dir, 'log_train.txt'), 'w')
    self.log_file.write(str(args) + '\n')
    self._log_string('**** Saving Evaluation results to the path %s ****' % self.log_dir)

    # dataset
    dataset_func = choose_dataset()
    self.dataset = dataset_func('loading',
                                split=args.split,
                                img_list=args.img_list,
                                is_training=self.is_training,
                                workers_num=self.num_workers)
    self.dataset_iter = self.dataset.load_batch(self.batch_size * self.gpu_num)
    self._log_string('**** Dataset length is %d ****' % len(self.dataset))
    self.val_size = len(self.dataset)

    # model list
    self.model_func = choose_model()
    self.model_list, self.pred_list, self.placeholders = self._build_model_list()

    # feeddict
    self.feeddict_producer = FeedDictCreater(self.dataset_iter,
                                             self.model_list,
                                             self.batch_size)

    # evaluation tools
    self.last_eval_model_path = None
    self.last_best_model = None
    self.last_best_result = -1

    self.saver = tf.train.Saver()
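# The evaluator above calls self._log_string(), which is not shown in this
# snippet. A minimal sketch of such a helper, assuming it only mirrors a
# message to the already-open log file and to stdout (the body is an
# assumption; only the names _log_string and log_file come from the snippet):
def _log_string(self, out_str):
    # Persist the message and echo it to the console.
    self.log_file.write(out_str + '\n')
    self.log_file.flush()
    print(out_str)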
def __init__(self, args):
    self.batch_size = cfg.TRAIN.CONFIG.BATCH_SIZE
    self.gpu_num = cfg.TRAIN.CONFIG.GPU_NUM
    self.num_workers = cfg.DATA_LOADER.NUM_THREADS
    self.log_dir = cfg.MODEL.PATH.EVALUATION_DIR

    self.is_training = False
    self.cls_thresh = float(args.cls_threshold)
    self.eval_interval_secs = args.eval_interval_secs
    self.restore_model_path = args.restore_model_path

    # save dir
    self.log_dir = args.restore_model_path[0:args.restore_model_path.find('/ckpt')]
    self.logger = create_logger(os.path.join(self.log_dir, 'log_eval.txt'))
    self.logger.info(str(args) + '\n')
    self.result_dir = os.path.join(self.log_dir, 'eval')
    self.logger.info('**** Saving Evaluation results to the path %s ****' % self.result_dir)

    # dataset
    dataset_func = choose_dataset()
    self.dataset = dataset_func('loading',
                                split=args.split,
                                img_list=args.img_list,
                                is_training=self.is_training)
    self.dataloader = DataLoader(self.dataset,
                                 batch_size=self.batch_size * self.gpu_num,
                                 shuffle=False,
                                 num_workers=self.num_workers,
                                 worker_init_fn=my_worker_init_fn,
                                 collate_fn=self.dataset.load_batch)
    self.logger.info('**** Dataset length is %d ****' % len(self.dataset))
    self.val_size = len(self.dataset)

    # model
    self.model_func = choose_model()
    self.model = self.model_func(self.batch_size, self.is_training)
    self.model = self.model.cuda()

    # tensorboard
    self.tb_log = SummaryWriter(log_dir=os.path.join(self.result_dir, 'tensorboard'))
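# The DataLoader above passes worker_init_fn=my_worker_init_fn, which is not
# defined in this snippet. A minimal sketch, assuming it only reseeds NumPy in
# each worker so random augmentations differ across workers (a common PyTorch
# pattern; the real helper may do more):
import numpy as np

def my_worker_init_fn(worker_id):
    # Give each data-loading worker process a distinct NumPy seed.
    np.random.seed(np.random.get_state()[1][0] + worker_id)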
def __init__(self, args):
    self.batch_size = cfg.TRAIN.CONFIG.BATCH_SIZE
    self.gpu_num = cfg.TRAIN.CONFIG.GPU_NUM
    self.num_workers = cfg.DATA_LOADER.NUM_THREADS
    self.log_dir = cfg.MODEL.PATH.CHECKPOINT_DIR
    self.max_iteration = cfg.TRAIN.CONFIG.MAX_ITERATIONS
    self.total_epochs = cfg.TRAIN.CONFIG.TOTAL_EPOCHS
    self.checkpoint_interval = cfg.TRAIN.CONFIG.CHECKPOINT_INTERVAL
    self.summary_interval = cfg.TRAIN.CONFIG.SUMMARY_INTERVAL
    self.trainable_param_prefix = cfg.TRAIN.CONFIG.TRAIN_PARAM_PREFIX
    self.trainable_loss_prefix = cfg.TRAIN.CONFIG.TRAIN_LOSS_PREFIX
    if args.output_dir is not None:
        self.log_dir = args.output_dir

    self.restore_model_path = args.restore_model_path
    self.is_training = True

    # gpu_num
    self.gpu_num = min(self.gpu_num, torch.cuda.device_count())

    # save dir
    datetime_str = str(datetime.datetime.now())
    datetime_str = datetime_str[0:datetime_str.find(' ')] + '_' + datetime_str[datetime_str.find(' ') + 1:]
    self.log_dir = os.path.join(self.log_dir, datetime_str)
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    self.logger = create_logger(os.path.join(self.log_dir, 'log_train.txt'))
    self.logger.info(str(args) + '\n')
    self.logger.info('**** Saving models to the path %s ****' % self.log_dir)
    self.logger.info('**** Saving configure file in %s ****' % self.log_dir)
    os.system('cp "%s" "%s"' % (args.cfg, self.log_dir))
    self.ckpt_dir = os.path.join(self.log_dir, 'ckpt')
    os.mkdir(self.ckpt_dir)

    # dataset
    dataset_func = choose_dataset()
    self.dataset = dataset_func('loading',
                                split=args.split,
                                img_list=args.img_list,
                                is_training=self.is_training)
    self.dataloader = DataLoader(self.dataset,
                                 batch_size=self.batch_size * self.gpu_num,
                                 shuffle=True,
                                 num_workers=self.num_workers,
                                 worker_init_fn=my_worker_init_fn,
                                 collate_fn=self.dataset.load_batch)
    self.logger.info('**** Dataset length is %d ****' % len(self.dataset))

    # models
    self.model_func = choose_model()
    self.model = self.model_func(self.batch_size, self.is_training)
    self.model = self.model.cuda()

    # tensorboard
    self.tb_log = SummaryWriter(log_dir=os.path.join(self.log_dir, 'tensorboard'))

    # optimizer
    self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.SOLVER.BASE_LR)
    self.lr_scheduler = LRScheduler(self.optimizer)

    # load from checkpoint
    start_epoch = it = 0
    if args.restore_model_path is not None:
        it, start_epoch = self.model.load_params_with_optimizer(args.restore_model_path,
                                                                to_cpu=False,
                                                                optimizer=self.optimizer,
                                                                logger=self.logger)
    self.start_epoch = start_epoch
    self.it = it

    if self.gpu_num > 1:
        self.logger.info("Use %d GPUs!" % self.gpu_num)
        self.model = torch.nn.DataParallel(self.model)
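# The trainer's loop is not part of this snippet. A minimal sketch of one
# training step using the objects set up above, assuming the model's forward
# pass returns a scalar loss and that LRScheduler exposes a step() method
# (both are assumptions; the actual loop in the repository may differ):
def _train_one_step(self, batch):
    self.model.train()
    self.optimizer.zero_grad()
    loss = self.model(batch)      # assumed to return a scalar loss tensor
    loss.backward()
    self.optimizer.step()
    self.lr_scheduler.step()      # hypothetical API of the LR scheduler
    self.it += 1
    if self.it % self.summary_interval == 0:
        self.tb_log.add_scalar('train/loss', loss.item(), self.it)
    return loss.item()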
def __init__(self, args):
    self.batch_size = cfg.TRAIN.CONFIG.BATCH_SIZE
    self.gpu_num = cfg.TRAIN.CONFIG.GPU_NUM
    self.num_workers = cfg.DATA_LOADER.NUM_THREADS
    self.log_dir = cfg.MODEL.PATH.CHECKPOINT_DIR
    self.max_iteration = cfg.TRAIN.CONFIG.MAX_ITERATIONS
    self.checkpoint_interval = cfg.TRAIN.CONFIG.CHECKPOINT_INTERVAL
    self.summary_interval = cfg.TRAIN.CONFIG.SUMMARY_INTERVAL
    self.trainable_param_prefix = cfg.TRAIN.CONFIG.TRAIN_PARAM_PREFIX
    self.trainable_loss_prefix = cfg.TRAIN.CONFIG.TRAIN_LOSS_PREFIX

    self.restore_model_path = args.restore_model_path
    self.is_training = True

    # gpu_num
    self.gpu_num = min(self.gpu_num, len(self._get_available_gpu_num()))

    # save dir
    datetime_str = str(datetime.datetime.now())
    self.log_dir = os.path.join(self.log_dir, datetime_str)
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    self.log_file = open(os.path.join(self.log_dir, 'log_train.txt'), 'w')
    self.log_file.write(str(args) + '\n')
    self._log_string('**** Saving models to the path %s ****' % self.log_dir)
    self._log_string('**** Saving configure file in %s ****' % self.log_dir)
    os.system('cp "%s" "%s"' % (args.cfg, self.log_dir))

    # dataset
    dataset_func = choose_dataset()
    self.dataset = dataset_func('loading',
                                split=args.split,
                                img_list=args.img_list,
                                is_training=self.is_training,
                                workers_num=self.num_workers)
    self.dataset_iter = self.dataset.load_batch(self.batch_size * self.gpu_num)
    self._log_string('**** Dataset length is %d ****' % len(self.dataset))

    # optimizer
    with tf.device('/cpu:0'):
        self.global_step = tf.contrib.framework.get_or_create_global_step()
        self.bn_decay = get_bn_decay(self.global_step)
        self.learning_rate = get_learning_rate(self.global_step)
        if cfg.SOLVER.TYPE == 'SGD':
            self.optimizer = tf.train.MomentumOptimizer(self.learning_rate,
                                                        momentum=cfg.SOLVER.MOMENTUM)
        elif cfg.SOLVER.TYPE == 'Adam':
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)

    # models
    self.model_func = choose_model()
    self.model_list, self.tower_grads, self.total_loss_gpu, self.losses_list, \
        self.params, self.extra_update_ops = self._build_model_list()
    tf.summary.scalar('total_loss', self.total_loss_gpu)

    # feeddict
    self.feeddict_producer = FeedDictCreater(self.dataset_iter,
                                             self.model_list,
                                             self.batch_size)

    with tf.device('/gpu:0'):
        self.grads = average_gradients(self.tower_grads)
        self.update_op = [
            self.optimizer.apply_gradients(zip(self.grads, self.params),
                                           global_step=self.global_step)
        ]
        self.update_op.extend(self.extra_update_ops)
        self.train_op = tf.group(*self.update_op)

    # tensorflow training ops
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1, allow_growth=True)
    config = tf.ConfigProto(
        gpu_options=gpu_options,
        device_count={"GPU": self.gpu_num},
        allow_soft_placement=True,
    )
    self.sess = tf.Session(config=config)
    self.saver = tf.train.Saver()
    self.merged = tf.summary.merge_all()
    self.train_writer = tf.summary.FileWriter(os.path.join(self.log_dir, 'train'),
                                              self.sess.graph)

    # initialize model
    self._initialize_model()
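# average_gradients() above is not defined in this snippet. A minimal sketch
# in the spirit of the standard TF1 multi-tower helper, assuming each entry of
# tower_grads is a list of (gradient, variable) pairs in the same variable
# order, and returning only the averaged gradients to match the
# zip(self.grads, self.params) usage above (the body is an assumption):
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Average this variable's gradient over all GPU towers.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        average_grads.append(grad)
    return average_grads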
    parser.add_argument('--split', default='training', help='Dataset split: training/testing')
    parser.add_argument('--img_list', default='val', help='train/val/trainval/test list')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    cfg_from_file(os.path.join(ROOT_DIR, '..', args.cfg))

    if args.img_list == 'test':
        # if test, no groundtruth available
        cfg.TEST.WITH_GT = False
        cfg.TRAIN.AUGMENTATIONS.MIXUP.OPEN = False
    if args.img_list == 'val':
        # if val, no mixup dataset
        cfg.TRAIN.AUGMENTATIONS.MIXUP.OPEN = False

    dataset_func = choose_dataset()
    dataset = dataset_func('preprocessing',
                           split=args.split,
                           img_list=args.img_list,
                           is_training=False)
    dataset.preprocess_batch()
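# Example invocation of this preprocessing script (the script name and config
# path below are hypothetical; only the --cfg/--split/--img_list flags come
# from the code above):
#
#   python preprocess_data.py --cfg <path/to/config.yaml> --split training --img_list train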