def test(db, system_config, model, args):
    split    = args.split
    testiter = args.testiter
    debug    = args.debug
    suffix   = args.suffix

    result_dir = system_config.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)
    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)
    make_dirs([result_dir])

    test_iter = system_config.max_iter if testiter is None else testiter
    print("loading parameters at iteration: {}".format(test_iter))

    print("building neural network...")
    nnet = NetworkFactory(system_config, model, test=True)
    print("loading parameters...")
    nnet.load_params(test_iter)

    nnet.cuda()
    nnet.eval_mode()
    test_func(system_config, db, nnet, result_dir, debug=debug)
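# Usage note: each test() variant here reads split/testiter/debug/suffix off an
# argparse-style namespace. A minimal, hypothetical wiring (the option names
# mirror the attributes the function reads; the defaults are assumptions, not
# taken from the original repo):
import argparse

def parse_test_args():
    parser = argparse.ArgumentParser(description="evaluate a trained model")
    parser.add_argument("--split", default="validation",
                        help="dataset split to evaluate on")
    parser.add_argument("--testiter", type=int, default=None,
                        help="iteration of the snapshot to load; falls back to max_iter")
    parser.add_argument("--debug", action="store_true",
                        help="dump debug visualizations")
    parser.add_argument("--suffix", default=None,
                        help="optional subdirectory appended to result_dir")
    return parser.parse_args()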
def test(db, system_config, model, args):
    split    = args.split
    testiter = args.testiter
    debug    = args.debug
    suffix   = args.suffix

    # output directory: result_dir + testiter + split
    result_dir = system_config.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)
    # append the optional suffix
    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)
    # create the directory
    make_dirs([result_dir])

    # set test_iter, falling back to the configured default if none was given
    test_iter = system_config.max_iter if testiter is None else testiter
    print("loading parameters at iteration: {}".format(test_iter))

    # build the neural network
    print("building neural network...")
    nnet = NetworkFactory(system_config, model)
    print("loading parameters...")
    nnet.load_params(test_iter)

    nnet.cuda()
    nnet.eval_mode()
    test_func(system_config, db, nnet, result_dir, debug=debug)
def test(db, system_config, model, args):
    print("\033[0;35;46m" + " " * 100 + "\033[0m")  # colored separator bar
    print("\033[0;33m " + "current location: {}/{}/.{}".format(
        os.getcwd(), os.path.basename(__file__),
        sys._getframe().f_code.co_name) + "\033[0m")

    split    = args.split
    testiter = args.testiter
    debug    = args.debug
    suffix   = args.suffix
    print("\033[0;36m " + "configuration parameters used for testing" + "\033[0m")
    print("\033[1;36m " + "split: {}, testiter: {}, debug: {}, suffix: {}".format(
        split, testiter, debug, suffix) + "\033[0m")

    # output directory: result_dir + testiter + split
    result_dir = system_config.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)
    # append the optional suffix
    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)
    # create the directory
    make_dirs([result_dir])
    print("\033[1;36m " + "result_dir: {}".format(result_dir) + "\033[0m")

    # set test_iter, falling back to the configured default if none was given
    test_iter = system_config.max_iter if testiter is None else testiter
    print("\033[1;36m " + "loading parameters from the snapshot at iteration: "
          + "\033[0m" + "{}".format(test_iter))

    # build the neural network
    print("\033[0;36m " + "building neural network..." + "\033[0m")
    nnet = NetworkFactory(system_config, model)
    print("\033[0;36m " + "loading parameters..." + "\033[0m")
    nnet.load_params(test_iter)

    nnet.cuda()
    nnet.eval_mode()
    test_func(system_config, db, nnet, result_dir, debug=debug)
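# The variant above wraps every message in raw ANSI escape sequences
# ("\033[0;36m ... \033[0m"). A small helper, sketched here (cprint is
# hypothetical, not part of the original code), would keep the escape codes
# in one place:
def cprint(text, code="0;36"):
    """Print `text` in the ANSI SGR color `code`, then reset the terminal."""
    print("\033[{}m{}\033[0m".format(code, text))

# e.g. cprint("building neural network...")                 # cyan
#      cprint("result_dir: {}".format(result_dir), "1;36")  # bright cyan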
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter  = args.start_iter
    distributed = args.distributed
    world_size  = args.world_size
    initialize  = args.initialize
    gpu         = args.gpu
    rank        = args.rank

    # reading arguments from json file
    batch_size       = system_config.batch_size
    learning_rate    = system_config.learning_rate
    max_iteration    = system_config.max_iter
    pretrained_model = system_config.pretrain
    snapshot         = system_config.snapshot
    val_iter         = system_config.val_iter
    display          = system_config.display
    decay_rate       = system_config.decay_rate
    stepsize         = system_config.stepsize

    print("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue   = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue   = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print("Process {}: training starts from iteration {} with learning_rate {}".format(
            rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        print("training start...")
    nnet.cuda()
    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training      = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            if display and iteration % display == 0:
                print("Process {}: training loss at iteration {}: {}".format(
                    rank, iteration, training_loss.item()))
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation      = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("Process {}: validation loss at iteration {}: {}".format(
                    rank, iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the pinned-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    if val_iter:
        terminate_tasks(validation_tasks)
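# All of the train() variants hand (queue, pinned_queue, semaphore) to a
# pin_memory worker thread but never show it. A minimal sketch consistent with
# how these loops use it (the "xs"/"ys" keys are an assumption, inferred from
# the nnet.train(training["xs"], training["ys"]) call in the last variant):
# the thread copies each batch into page-locked (pinned) host memory so the
# later host-to-GPU transfer can be asynchronous, and it exits once the main
# thread releases the semaphore.
def pin_memory(data_queue, pinned_data_queue, semaphore):
    while True:
        data = data_queue.get()

        # torch.Tensor.pin_memory() returns a copy in page-locked memory
        data["xs"] = [x.pin_memory() for x in data["xs"]]
        data["ys"] = [y.pin_memory() for y in data["ys"]]

        pinned_data_queue.put(data)

        # the main thread releases the semaphore to request shutdown; a
        # non-blocking acquire succeeds exactly once after that happens
        if semaphore.acquire(blocking=False):
            return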
def train(training_dbs, validation_db, system_config, model, args):
    print("\033[0;33m " + "current location: {}/{}/.{}".format(
        os.getcwd(), os.path.basename(__file__),
        sys._getframe().f_code.co_name) + "\033[0m")

    # reading arguments from command
    start_iter  = args.start_iter
    distributed = args.distributed
    world_size  = args.world_size
    initialize  = args.initialize
    gpu         = args.gpu
    rank        = args.rank

    # reading arguments from json file
    batch_size       = system_config.batch_size
    learning_rate    = system_config.learning_rate
    max_iteration    = system_config.max_iter
    pretrained_model = system_config.pretrain
    snapshot         = system_config.snapshot
    val_iter         = system_config.val_iter
    display          = system_config.display
    decay_rate       = system_config.decay_rate
    stepsize         = system_config.stepsize

    print("\033[1;36m " + "Process {}: building model...".format(rank) + "\033[0m")
    nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # open four queues to hold the data
    # queues storing data for training
    training_queue   = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue   = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # check whether a pretrained model is provided
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    # check whether a starting iteration was given
    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print("Process {}: training starts from iteration {} with learning_rate {}".format(
            rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # train the model
    if rank == 0:
        print("\033[1;36m " + "training start..." + "\033[0m")
    nnet.cuda()
    nnet.train_mode()

    # tqdm is a fast, extensible progress bar for long-running Python loops
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training      = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            # if a display interval is set, report the training loss at every multiple of it
            if display and iteration % display == 0:
                print("\033[1;36m " + "Process {}: training loss at iteration [{}]:".format(rank, iteration)
                      + "\033[0m" + "{}".format(training_loss.item()))
            del training_loss

            # if a validation interval is set, evaluate on the validation set
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation      = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\033[1;33m " + "Process {}:".format(rank) + "\033[0m"
                      + "\033[1;36m " + "validation loss at iteration {}:".format(iteration)
                      + "\033[0m" + "{}".format(validation_loss.item()))
                nnet.train_mode()

            # snapshot interval
            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            # learning-rate update interval
            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                print("\033[1;35m " + "learning rate updated to:" + "\033[0m" + "{}".format(learning_rate))
                nnet.set_lr(learning_rate)

    # sending signal to kill the pinned-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    if val_iter:
        terminate_tasks(validation_tasks)
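# Both resume paths above rely on the same step schedule: dividing by
# decay_rate ** (start_iter // stepsize) on resume reproduces the per-step
# decay that `learning_rate /= decay_rate` applies every `stepsize`
# iterations. A self-contained check (the default values below are
# illustrative only, not read from system_config):
import math

def lr_at(iteration, base_lr=2.5e-4, decay_rate=10, stepsize=45000):
    """Learning rate in effect after `iteration` iterations."""
    return base_lr / (decay_rate ** (iteration // stepsize))

# the step-wise loop reaches the same value as the closed form
lr = 2.5e-4
for it in range(1, 90000 + 1):
    if it % 45000 == 0:
        lr /= 10
assert math.isclose(lr, lr_at(90000))   # two decays: 2.5e-4 -> 2.5e-6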
def train(train_logger, training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter  = args.start_iter
    distributed = args.distributed
    world_size  = args.world_size
    initialize  = args.initialize
    gpu         = args.gpu
    rank        = args.rank

    # reading arguments from json file
    batch_size       = system_config.batch_size
    learning_rate    = system_config.learning_rate
    max_iteration    = system_config.max_iter
    pretrained_model = system_config.pretrain
    snapshot         = system_config.snapshot
    val_iter         = system_config.val_iter
    display          = system_config.display
    decay_rate       = system_config.decay_rate
    stepsize         = system_config.stepsize

    train_logger.train_logging("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue   = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue   = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(train_logger, system_config, training_dbs, training_queue, data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(train_logger, system_config, [validation_db], validation_queue, data_sampling_func, False)

    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        train_logger.train_logging(
            "Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.set_lr(learning_rate)
        train_logger.train_logging(
            "Process {}: training starts from iteration {} with learning_rate {}".format(
                rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        train_logger.train_logging("training start...")
    nnet.cuda()
    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training      = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)
            train_logger.tb_logging('Train/loss', {'tloss': training_loss.item()}, iteration)

            if display and iteration % display == 0:
                train_logger.train_logging(
                    "Process {}: training loss at iteration {}: {}".format(
                        rank, iteration, training_loss.item()))
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                # calculate validation loss
                validation      = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                train_logger.train_logging(
                    "Process {}: validation loss at iteration {}: {}".format(
                        rank, iteration, validation_loss.item()))
                train_logger.tb_logging('Val/loss', {'vloss': validation_loss.item()}, iteration)
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.eval_mode()
                # calculate validation mAP
                val_split = system_config.val_split
                mAP, _, detect_average_time = test(validation_db, system_config, nnet,
                                                   val_iter, val_split, debug=True)
                train_logger.train_logging(
                    "Process {}: mAP at iteration {}: {}".format(rank, iteration, mAP))
                train_logger.train_logging(
                    "Detect average time: {}".format(detect_average_time))
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

            # optional GPU utilization logging via pynvml (disabled)
            # dc = 0
            # handle = nvmlDeviceGetHandleByIndex(dc)
            # res = nvmlDeviceGetUtilizationRates(handle)
            # gpu_util = res.gpu
            # res = nvmlDeviceGetMemoryInfo(handle)
            # gpu_mem = res.used / 1024 / 1024
            # train_logger.tb_logging('data/NV', {'gpu-util': gpu_util, 'gpu-mem': gpu_mem}, iteration)

    # sending signal to kill the pinned-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    if val_iter:
        terminate_tasks(validation_tasks)
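# The variant above assumes a train_logger with two methods:
# train_logging(msg) for text logs and tb_logging(tag, scalar_dict, step) for
# TensorBoard scalars. A minimal sketch of such an object (this class is
# hypothetical; only the two method signatures are taken from the code above),
# built on the standard logging module and torch.utils.tensorboard:
import logging

from torch.utils.tensorboard import SummaryWriter

class TrainLogger:
    def __init__(self, log_dir):
        logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
        self._logger = logging.getLogger("train")
        self._writer = SummaryWriter(log_dir=log_dir)

    def train_logging(self, msg):
        self._logger.info(msg)

    def tb_logging(self, tag, scalar_dict, step):
        # add_scalars groups several scalars under one main tag
        self._writer.add_scalars(tag, scalar_dict, step)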
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter = args.start_iter
    initialize = args.initialize
    gpu        = args.gpu

    # reading arguments from json file
    learning_rate    = system_config.learning_rate
    max_iteration    = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize         = system_config.stepsize
    snapshot         = system_config.snapshot
    val_iter         = system_config.val_iter
    display          = system_config.display
    decay_rate       = system_config.decay_rate

    print("building model...")
    nnet = NetworkFactory(system_config, model, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue   = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue   = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading:
    # each worker gets its own training_db; one data-reading process is
    # spawned per worker to fill training_queue in parallel
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    # set up the semaphores; the pin threads move data from training_queue
    # into pinned_training_queue
    training_pin_semaphore   = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        learning_rate  = max(1e-4, learning_rate)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start, max iteration {}".format(max_iteration))
    nnet.cuda()
    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training      = pinned_training_queue.get(block=True)
            training_loss = nnet.train(training["xs"], training["ys"])

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(iteration, training_loss.item()))
                # writer.add_scalar('training_loss', training_loss, global_step=iteration)
                print("[log-loss]:{}={}".format(iteration, training_loss.item()))
                writer.add_scalar('train_loss', training_loss, global_step=iteration)
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation      = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(validation["xs"], validation["ys"])
                print("[log-validation-loss]:{}={}".format(iteration, validation_loss.item()))
                writer.add_scalar('validation_loss', validation_loss, global_step=iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                learning_rate  = max(5e-5, learning_rate)
                nnet.set_lr(learning_rate)
                print("set learning rate {}".format(learning_rate))

    # sending signal to kill the pinned-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    if val_iter:
        terminate_tasks(validation_tasks)

    writer.close()
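# Every variant pairs init_parallel_jobs(...) with terminate_tasks(...), but
# neither is shown. A minimal sketch of the pattern the call sites imply
# (prefetch_data and its body are assumptions, including the data_aug keyword):
# one daemon process per db keeps the shared queue full until it is terminated.
from torch.multiprocessing import Process

def prefetch_data(system_config, db, data_queue, sample_data, data_aug):
    # loop forever; the parent kills this process via terminate_tasks()
    while True:
        data = sample_data(system_config, db, data_aug=data_aug)
        data_queue.put(data)

def init_parallel_jobs(system_config, dbs, data_queue, sample_data, data_aug):
    tasks = [
        Process(target=prefetch_data,
                args=(system_config, db, data_queue, sample_data, data_aug))
        for db in dbs
    ]
    for task in tasks:
        task.daemon = True
        task.start()
    return tasks

def terminate_tasks(tasks):
    for task in tasks:
        task.terminate()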