def train(training_dbs, validation_db, start_iter=0):
    """Train the network with TensorBoard logging and periodic kp_detection eval.

    Args:
        training_dbs: list of training database objects; element 0 supplies the
            sampling module name (``.data``) and the model configuration.
        validation_db: validation database object.
        start_iter: iteration to resume from; 0 starts a fresh run.

    Side effects: spawns data-prefetch processes and pin-memory threads, writes
    snapshots and TensorBoard logs, and runs entirely on the GPU via ``nnet``.
    """
    learning_rate    = system_configs.learning_rate
    max_iteration    = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot         = system_configs.snapshot
    val_iter         = system_configs.val_iter
    display          = system_configs.display
    decay_rate       = system_configs.decay_rate
    stepsize         = system_configs.stepsize

    # Multiprocessing queues feed raw batches; plain queue.Queue holds the
    # pinned (page-locked) copies produced by the pin_memory threads.
    training_queue   = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    pinned_training_queue   = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # Load the dataset-specific sampling function (e.g. sample/coco.py).
    data_file   = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # Allocate resources for parallel reading (augmentation on for training).
    training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False)

    # Semaphores start acquired; releasing them at the end signals the
    # pin-memory threads to stop.
    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # Recompute the decayed learning rate for the resume point.
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # defining tensorboard writer
    tensorboard = Tensorboard('logs')

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training  = pinned_training_queue.get(block=True)
            out_train = nnet.train(**training)

            if display and iteration % display == 0:
                for idX, eleX in enumerate(["training", "focal", "pull", "push", "regr"]):
                    print("{} loss at iteration {}: {}".format(
                        eleX, iteration, out_train[idX].item()))
                    tensorboard.log_scalar('training/{} loss'.format(eleX),
                                           out_train[idX].item(), iteration)
            del out_train

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                tensorboard.log_scalar('validation/loss', validation_loss.item(), iteration)
                # Full detection evaluation every other validation interval.
                if iteration % (val_iter * 2) == 0:
                    kp_detection(validation_db, nnet, "./cache/", debug=False,
                                 subset_val=True, TB_obj=tensorboard, TB_iter=iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # closing tensorboard writer
    tensorboard.close()

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes; validation_tasks only exists when
    # val_iter is truthy (fixes a NameError on val_iter == 0).
    for training_task in training_tasks:
        training_task.terminate()
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Train the network (focal/grouping/region/regr loss variant).

    Args:
        training_dbs: list of training database objects; element 0 supplies the
            sampling module name (``.data``) and the model configuration.
        validation_db: validation database object.
        start_iter: iteration to resume from; 0 starts a fresh run.
    """
    learning_rate    = system_configs.learning_rate
    max_iteration    = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot         = system_configs.snapshot
    val_iter         = system_configs.val_iter
    display          = system_configs.display
    decay_rate       = system_configs.decay_rate
    stepsize         = system_configs.stepsize

    # Multiprocessing queues feed raw batches; plain queue.Queue holds the
    # pinned copies produced by the pin_memory threads.
    training_queue   = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    pinned_training_queue   = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # Load the dataset-specific sampling function (e.g. sample/coco.py).
    data_file   = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # Allocate resources for parallel reading (augmentation on for training).
    training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False)

    # Semaphores start acquired; releasing them at the end signals the
    # pin-memory threads to stop.
    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # Recompute the decayed learning rate for the resume point.
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, grouping_loss, region_loss, regr_loss = nnet.train(
                **training)

            if display and iteration % display == 0:
                print(
                    "iter {}, all: {:.4f}, focal: {:.4f}, grouping:{:.4f}, region: {:.4f}, regr: {:.4f}"
                    .format(iteration, training_loss.item(), focal_loss.item(),
                            grouping_loss.item(), region_loss.item(), regr_loss.item()))
            # Drop tensor references promptly to free GPU memory.
            del training_loss, focal_loss, grouping_loss, region_loss, regr_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes; validation_tasks only exists when
    # val_iter is truthy (fixes a NameError on val_iter == 0).
    for training_task in training_tasks:
        training_task.terminate()
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Train the network (focal/pull/push/regr loss variant).

    Args:
        training_dbs: list of training database objects; element 0 supplies the
            sampling module name (``.data``) and the model configuration.
        validation_db: validation database object.
        start_iter: iteration to resume from; 0 starts a fresh run.
    """
    # Hyper-parameters read from the JSON config via system_configs.
    learning_rate    = system_configs.learning_rate   # base learning rate
    max_iteration    = system_configs.max_iter        # total iterations
    pretrained_model = system_configs.pretrain        # optional pretrained weights path
    snapshot         = system_configs.snapshot        # save params every `snapshot` iters
    val_iter         = system_configs.val_iter        # validate every `val_iter` iters
    display          = system_configs.display         # print losses every `display` iters
    decay_rate       = system_configs.decay_rate      # LR decay factor
    stepsize         = system_configs.stepsize        # LR decay interval

    # Multiprocessing queues (bounded at prefetch_size / 5) feed raw batches;
    # plain queue.Queue instances hold the page-locked ("pinned") copies.
    training_queue   = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    pinned_training_queue   = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # Load the dataset-specific sampling function (e.g. sample/coco.py).
    data_file   = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # Spawn prefetch processes; they run sample_data (augmentation, ground
    # truth generation, ...) and push batches into the queues.
    training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data, True)
    if val_iter:
        # Validation prefetching runs without augmentation.
        validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False)

    # Semaphores start acquired; releasing them at the end tells the
    # pin-memory threads to exit.
    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    # Daemon thread: dies with the main thread, no explicit join needed.
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # Recompute the decayed learning rate for the resume point, then
        # reload the matching snapshot.
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))
            # Drop tensor references promptly to free GPU memory.
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes; validation_tasks only exists when
    # val_iter is truthy (fixes a NameError on val_iter == 0).
    for training_task in training_tasks:
        training_task.terminate()
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Train the network (focal/pull/push/regr loss variant, CenterNet-style).

    Args:
        training_dbs: list of database instances (e.g. MSCOCO) used for
            loading annotations; element 0 supplies the sampling module name
            (``.data``) and the model configuration.
        validation_db: validation database instance.
        start_iter: iteration to resume from; 0 starts a fresh run.

    Config values (learning_rate, max_iter, pretrain, snapshot, val_iter,
    display, decay_rate, stepsize, prefetch_size) come from the experiment
    JSON via ``system_configs``.
    """
    learning_rate    = system_configs.learning_rate
    max_iteration    = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot         = system_configs.snapshot
    val_iter         = system_configs.val_iter
    display          = system_configs.display
    decay_rate       = system_configs.decay_rate
    stepsize         = system_configs.stepsize

    # torch.multiprocessing queues: worker processes cannot return values, so
    # batches are exchanged through these bounded queues.
    training_queue   = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # queue.Queue instances are for the in-process pin-memory threads.
    pinned_training_queue   = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # Load the dataset-specific sampling function, e.g. sample/coco.py's
    # sample_data: it turns images + annotations into network-ready tensors
    # (heatmaps, regressions, ...).
    data_file   = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # One prefetch Process per training db; each keeps filling training_queue.
    training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data, True)
    if val_iter:
        # Validation prefetching runs without augmentation.
        validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False)

    # Semaphore(1) starts acquired here, so the counter sits at 0 for the
    # whole run; release() at the end is the shutdown signal the pin_memory
    # threads wait on.
    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    # pin_memory moves batches from the multiprocessing queue into
    # page-locked host memory for fast host->GPU transfer.
    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    # Daemon thread: exits automatically with the main thread.
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # Recompute the decayed learning rate for the resume point.
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))
            # Drop tensor references promptly to free GPU memory.
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes; validation_tasks only exists when
    # val_iter is truthy (fixes a NameError on val_iter == 0).
    for training_task in training_tasks:
        training_task.terminate()
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()
def train(start_iter=20150):
    """Epoch-style training loop over in-scope dataloaders.

    NOTE(review): the default ``start_iter=20150`` looks like a hard-coded
    resume point for one specific run — confirm before reusing.
    NOTE(review): both the "train" and "test" inner loops iterate
    ``test_dataloader``; presumably the first should use ``train_dataloader``
    (which is passed to NetworkFactory) — verify against the caller.
    """
    # Hyper-parameters from the experiment config.
    learning_rate = system_configs.learning_rate
    max_iteration = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot = system_configs.snapshot
    val_iter = system_configs.val_iter
    display = system_configs.display
    decay_rate = system_configs.decay_rate
    stepsize = system_configs.stepsize
    #vis = visdom.Visdom()
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("building model...")
    nnet = NetworkFactory(train_dataloader)
    #nnet = nnet.cuda()
    #nnet = nn.DataParallel(nnet).cuda()
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        # NOTE(review): loads a hard-coded modified checkpoint path instead of
        # `pretrained_model` itself; change_feature presumably writes that
        # file — confirm.
        change_feature(pretrained_model)
        nnet.load_pretrained_params(
            "./MatrixNetAnchorsResnet50_48LayerRange_640isize/nnet/MatrixNetAnchors/MatrixNetAnchors_50_modified.pkl"
        )
        #params = torch.load(pretrained_model)
        #nnet.load_state_dict({k.replace('module.',''):v for k,v in params['state_dict'].items()})
    if start_iter:
        # LR decay on resume is intentionally disabled here (kept commented).
        #learning_rate /= (decay_rate ** (start_iter // stepsize))
        #print(learning_rate)
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)
    print("training start...")
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    nnet.cuda()
    with stdout_to_tqdm() as save_stdout:
        # Each "iteration" here is a full epoch over the dataloader.
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            loss_total = 0
            nnet.train_mode()
            for index, (ls1, ls_msk) in enumerate(test_dataloader):
                training_loss = nnet.train(ls1, ls_msk)
                #print(training_loss)
                # loss_total becomes a tensor after the first addition, so
                # .item() below is valid.
                loss_total = loss_total + training_loss
            test_loss = 0
            nnet.eval_mode()
            with torch.no_grad():
                for index, (ls1, ls_msk) in enumerate(test_dataloader):
                    test_iter_loss = nnet.validate(ls1, ls_msk)
                    test_loss = test_loss + test_iter_loss
            print('epoch train loss = %f, epoch test loss = %f' %
                  (loss_total / len(test_dataloader),
                   test_loss / len(test_dataloader)))
            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, loss_total.item()))
            test_loss_iter = test_loss / len(test_dataloader)
            # Release the accumulated tensors before the next epoch.
            del loss_total
            del test_loss
            # Extra snapshot whenever validation loss drops below a fixed
            # threshold (magic constant — tuned for this dataset).
            if test_loss_iter < 0.0009:
                nnet.save_params(iteration)
            if iteration % snapshot == 0:
                nnet.save_params(iteration)
                #test()
            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)
def train(training_dbs, validation_db, start_iter=0, debug=False):
    """Training-only loop (validation deliberately disabled in this variant).

    Args:
        training_dbs: list of training database objects; element 0 supplies
            the sampling module name (``.data``) and the model configuration.
        validation_db: accepted for signature compatibility; unused here.
        start_iter: iteration to resume from; 0 starts a fresh run.
        debug: forwarded to the prefetch workers.
    """
    cfg = system_configs
    learning_rate    = cfg.learning_rate
    max_iteration    = cfg.max_iter
    pretrained_model = cfg.pretrain
    snapshot         = cfg.snapshot
    display          = cfg.display
    decay_rate       = cfg.decay_rate
    stepsize         = cfg.stepsize

    training_size = len(training_dbs[0].db_inds)

    # Raw batches arrive via a multiprocessing queue; the pin-memory thread
    # copies them into a plain queue.Queue of page-locked batches.
    training_queue        = Queue(cfg.prefetch_size)
    pinned_training_queue = queue.Queue(cfg.prefetch_size)

    # Resolve the dataset-specific sampler, e.g. sample/<dataset>.py.
    sample_data = importlib.import_module(
        "sample.{}".format(training_dbs[0].data)).sample_data

    # Start the prefetch worker processes.
    training_tasks = init_parallel_jobs(
        training_dbs, training_queue, sample_data, True, debug)

    # Acquired immediately so the counter sits at 0; release() at the end is
    # the stop signal for the pin-memory thread.
    training_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()

    pin_worker = threading.Thread(
        target=pin_memory,
        args=(training_queue, pinned_training_queue, training_pin_semaphore))
    pin_worker.daemon = True
    pin_worker.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if start_iter:
        # Apply the LR decay schedule up to the resume point.
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()

    avg_loss = AverageMeter()
    with stdout_to_tqdm() as save_stdout:
        progress = tqdm(range(start_iter + 1, max_iteration + 1),
                        file=save_stdout, ncols=80)
        for iteration in progress:
            batch = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**batch)
            avg_loss.update(training_loss.item())

            if display and iteration % display == 0:
                print("training loss at iteration {}: {:.6f} ({:.6f})".format(
                    iteration, training_loss.item(), avg_loss.avg))
            del training_loss

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % 100 == 0:
                # Rolling "latest" snapshot plus a fresh running average.
                nnet.save_params(-1)
                avg_loss = AverageMeter()

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # Signal the pin-memory thread to stop.
    training_pin_semaphore.release()

    # Shut down the prefetch worker processes.
    for worker in training_tasks:
        worker.terminate()
def train(training_dbs, validation_db, start_iter=0, freeze=False):
    """Train the network with DETR-style MetricLogger reporting.

    Args:
        training_dbs: list of training database objects; element 0 supplies
            the sampling module name (``.data``).
        validation_db: validation database object.
        start_iter: iteration to resume from; 0 starts a fresh run.
        freeze: accepted for signature compatibility; not used in this body.
    """
    learning_rate    = system_configs.learning_rate
    max_iteration    = system_configs.max_iter
    pretrained_model = system_configs.pretrain
    snapshot         = system_configs.snapshot
    val_iter         = system_configs.val_iter
    display          = system_configs.display
    decay_rate       = system_configs.decay_rate
    stepsize         = system_configs.stepsize
    batch_size       = system_configs.batch_size

    # training_size drives the per-epoch stats flush below.
    training_size = len(training_dbs[0].db_inds)

    # Multiprocessing queues feed raw batches; plain queue.Queue holds the
    # pinned copies produced by the pin_memory threads.
    training_queue   = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)
    pinned_training_queue   = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # Load the dataset-specific sampling function (e.g. "sample.coco").
    data_file   = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # Allocate resources for parallel reading.
    training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data)

    # Semaphores start acquired; releasing them at the end signals the
    # pin-memory threads to stop.
    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(flag=True)

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # Recompute the decayed learning rate for the resume point.
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start...")
    nnet.cuda()
    nnet.train_mode()

    header = None
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))

    with stdout_to_tqdm() as save_stdout:
        for iteration in metric_logger.log_every(
                tqdm(range(start_iter + 1, max_iteration + 1),
                     file=save_stdout, ncols=67),
                print_freq=10, header=header):
            training = pinned_training_queue.get(block=True)

            viz_split = 'train'
            # Save visualizations only on display iterations.
            save = True if (display and iteration % display == 0) else False
            (set_loss, loss_dict) \
                = nnet.train(iteration, save, viz_split, **training)
            (loss_dict_reduced, loss_dict_reduced_unscaled,
             loss_dict_reduced_scaled, loss_value) = loss_dict
            metric_logger.update(loss=loss_value,
                                 **loss_dict_reduced_scaled,
                                 **loss_dict_reduced_unscaled)
            metric_logger.update(class_error=loss_dict_reduced['class_error'])
            metric_logger.update(lr=learning_rate)
            del set_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                viz_split = 'val'
                save = True
                validation = pinned_validation_queue.get(block=True)
                (val_set_loss, val_loss_dict) \
                    = nnet.validate(iteration, save, viz_split, **validation)
                (loss_dict_reduced, loss_dict_reduced_unscaled,
                 loss_dict_reduced_scaled, loss_value) = val_loss_dict
                print('[VAL LOG]\t[Saving training and evaluating images...]')
                metric_logger.update(loss=loss_value,
                                     **loss_dict_reduced_scaled,
                                     **loss_dict_reduced_unscaled)
                metric_logger.update(class_error=loss_dict_reduced['class_error'])
                metric_logger.update(lr=learning_rate)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

            # Flush averaged stats roughly once per epoch.
            if iteration % (training_size // batch_size) == 0:
                metric_logger.synchronize_between_processes()
                print("Averaged stats:", metric_logger)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes; validation_tasks only exists when
    # val_iter is truthy (fixes a NameError on val_iter == 0).
    for training_task in training_tasks:
        training_task.terminate()
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()
def train(training_dbs, validation_db, start_iter=0):
    """Run the main training loop.

    Reads every hyper-parameter from the global ``system_configs`` object,
    spawns data-fetching worker processes plus pin-memory threads, builds the
    network via ``NetworkFactory`` and trains until ``max_iter``.

    Args:
        training_dbs: list of training databases; each exposes ``db_inds`` and
            ``data`` (the sampling-module name) — assumed interface, confirm
            against the db implementation.
        validation_db: validation database with the same interface.
        start_iter: iteration to resume from (0 means a fresh run).
    """
    learning_rate    = system_configs.learning_rate   # base learning rate
    max_iteration    = system_configs.max_iter        # total number of training iterations
    pretrained_model = system_configs.pretrain        # optional path to pretrained weights
    snapshot         = system_configs.snapshot        # save parameters every `snapshot` iterations
    val_iter         = system_configs.val_iter        # validate every `val_iter` iterations (falsy disables)
    display          = system_configs.display         # print losses every `display` iterations
    decay_rate       = system_configs.decay_rate      # learning-rate decay factor
    stepsize         = system_configs.stepsize        # decay the learning rate every `stepsize` iterations

    # size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # inter-process queues carrying raw sampled batches
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # thread-level queues holding pinned (page-locked) batches ready for the GPU
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load the data sampling function for this dataset
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocate worker processes for parallel data reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue,
                                              sample_data, False)

    # semaphores used later to signal the pin-memory threads to exit
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # build the model
    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # recover the learning rate that was in effect at `start_iter`
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # start training
    print("training start...")
    nnet.cuda()
    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))

            # drop loss tensors promptly so their GPU memory can be reclaimed
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss

            # periodic validation
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # signal the pin-memory threads to exit
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminate the data-fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    # fix: `validation_tasks` only exists when val_iter is truthy; the original
    # terminated it unconditionally, raising NameError when validation is disabled
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()
def train(training_dbs, validation_db, validation_db_2, tb, suffix, cfg_file, es, start_iter):
    """Run the main training loop with TensorBoard logging and optional early stopping.

    Reads every hyper-parameter from the global ``system_configs`` object,
    spawns data-fetching worker processes plus pin-memory threads, builds the
    network via ``NetworkFactory``, reports warm-up/inference timings, and
    trains until ``max_iter`` (or until early stopping fires).

    Args:
        training_dbs: list of training databases exposing ``db_inds``, ``data``
            and ``image_file`` — assumed interface, confirm against the db class.
        validation_db: database used for the periodic validation loss.
        validation_db_2: database evaluated once per epoch for COCO-style stats.
        tb: a TensorBoard ``SummaryWriter``-like object (``add_scalar``).
        suffix: run identifier used for the result directory.
        cfg_file: config identifier forwarded to ``EarlyStopping``.
        es: if truthy, use early stopping instead of periodic snapshots.
        start_iter: iteration to resume from (0 means a fresh run).
    """
    learning_rate    = system_configs.learning_rate   # base learning rate
    max_iteration    = system_configs.max_iter        # total number of training iterations
    pretrained_model = system_configs.pretrain        # optional path to pretrained weights
    snapshot         = system_configs.snapshot        # save parameters every `snapshot` iterations
    val_iter         = system_configs.val_iter        # validate every `val_iter` iterations (falsy disables)
    display          = system_configs.display         # print losses every `display` iterations
    decay_rate       = system_configs.decay_rate      # learning-rate decay factor
    stepsize         = system_configs.stepsize        # decay the learning rate every `stepsize` iterations

    # size of each database
    training_size = len(training_dbs[0].db_inds)
    validation_size = len(validation_db.db_inds)

    # inter-process queues carrying raw sampled batches
    training_queue = Queue(system_configs.prefetch_size)
    validation_queue = Queue(5)

    # thread-level queues holding pinned (page-locked) batches ready for the GPU
    pinned_training_queue = queue.Queue(system_configs.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # load the data sampling function for this dataset
    data_file = "sample.{}".format(training_dbs[0].data)
    sample_data = importlib.import_module(data_file).sample_data

    # allocate worker processes for parallel data reading
    training_tasks = init_parallel_jobs(training_dbs, training_queue,
                                        sample_data, True)
    if val_iter:
        validation_tasks = init_parallel_jobs([validation_db], validation_queue,
                                              sample_data, False)

    # semaphores used later to signal the pin-memory threads to exit
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    print("building model...")
    nnet = NetworkFactory(training_dbs[0])

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # recover the learning rate that was in effect at `start_iter`
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.load_params(start_iter)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if es:
        early_stopping = EarlyStopping(patience=100, verbose=True)

    print("training start...")
    nnet.cuda()

    def _report_inference_time(label):
        # Time one end-to-end detection pass on the first training image and
        # print it; factored out because the original duplicated this verbatim.
        image = cv2.imread(training_dbs[0].image_file(0))  # renamed: `input` shadowed the builtin
        tic = time.time()
        kp_detection(image, nnet, score_min=0.5)
        elapsed = time.time() - tic
        print("\n##################################################")
        print(label + ": " + str(elapsed * 1000) + "ms")
        print("##################################################")

    # model warm-up (first pass includes CUDA kernel compilation / caching)
    nnet.eval_mode()
    _report_inference_time("Warm-up + Inference Time")
    # steady-state inference time
    _report_inference_time("Inference Time")

    result_dir = system_configs.result_dir
    result_dir = os.path.join(result_dir, "Training_Validation", "val2017",
                              str(suffix))
    make_dirs([result_dir])

    nnet.train_mode()

    # fix: hoisted out of the loop (loop-invariant) and guarded with max(1, ...)
    # so a batch size larger than the dataset no longer raises ZeroDivisionError
    epoch = max(1, training_size // system_configs.batch_size)

    # per-epoch TensorBoard tags, in the exact order of the stats tuple
    # returned by kp_detection_train (COCO eval ordering)
    epoch_metric_tags = [
        'Average mAP vs Epoch',
        'mAP (IoU 0.5) vs Epoch',
        'mAP (IoU 0.75) vs Epoch',
        'mAP (Area = Small) vs Epoch',
        'mAP (Area = Medium) vs Epoch',
        'mAP (Area = Large) vs Epoch',
        'mAR (Max Detection = 1) vs Epoch',
        'mAR (Max Detection = 10) vs Epoch',
        'mAR (Max Detection = 100) vs Epoch',
        'mAR (Area = Small) vs Epoch',
        'mAR (Area = Medium) vs Epoch',
        'mAR (Area = Large) vs Epoch',
    ]

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train(
                **training)

            if display and iteration % display == 0:
                print("\ntraining loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("focal loss at iteration {}: {}".format(
                    iteration, focal_loss.item()))
                print("pull loss at iteration {}: {}".format(
                    iteration, pull_loss.item()))
                print("push loss at iteration {}: {}".format(
                    iteration, push_loss.item()))
                print("regr loss at iteration {}: {}".format(
                    iteration, regr_loss.item()))

                tb.add_scalar('Training Loss vs Iteration',
                              training_loss.item(), iteration)
                tb.add_scalar('Focal Loss vs Iteration',
                              focal_loss.item(), iteration)
                tb.add_scalar('Pull Loss vs Iteration',
                              pull_loss.item(), iteration)
                tb.add_scalar('Push Loss vs Iteration',
                              push_loss.item(), iteration)
                tb.add_scalar('Offset Loss vs Iteration',
                              regr_loss.item(), iteration)

            # drop loss tensors promptly so their GPU memory can be reclaimed
            del training_loss, focal_loss, pull_loss, push_loss, regr_loss

            # periodic validation loss (and early-stopping bookkeeping)
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\n##################################################")
                print("validation loss at iteration {}: {}".format(
                    iteration, validation_loss.item()))
                print("##################################################")
                tb.add_scalar('Validation Loss vs Iteration',
                              validation_loss.item(), iteration)
                if es:
                    early_stopping(validation_loss, iteration, nnet, cfg_file)
                nnet.train_mode()

            # full COCO-style evaluation once per epoch
            if iteration % epoch == 0:
                nnet.eval_mode()
                stats = kp_detection_train(validation_db_2, nnet, result_dir)
                epoch_num = iteration / epoch
                for tag, value in zip(epoch_metric_tags, stats):
                    tb.add_scalar(tag, value, epoch_num)
                nnet.train_mode()

            if es and early_stopping.early_stop:
                print("Early stopping")
                break

            # with early stopping enabled, EarlyStopping handles checkpointing
            if not es:
                if iteration % snapshot == 0:
                    nnet.save_params(iteration)

            # NOTE(review): lr decay kept unconditional, matching the other
            # train() variants in this file — confirm it should not be gated on
            # `not es` as well
            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # signal the pin-memory threads to exit
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminate the data-fetching processes
    for training_task in training_tasks:
        training_task.terminate()
    # fix: `validation_tasks` only exists when val_iter is truthy; the original
    # terminated it unconditionally, raising NameError when validation is disabled
    if val_iter:
        for validation_task in validation_tasks:
            validation_task.terminate()