def train(training_dbs, validation_db, system_config, model, args):
    """Run the training loop for `model` with parallel data-loading workers.

    Args:
        training_dbs: list of training databases, one per reader process.
        validation_db: validation database (used only when `system_config.val_iter` is set).
        system_config: configuration object (lr schedule, snapshot cadence, prefetch size, ...).
        model: model definition handed to NetworkFactory.
        args: command-line arguments (start_iter, distributed, gpu, rank, ...).
    """
    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu)
    if initialize:
        # only write the initial weights and stop
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned (page-locked) data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    validation_tasks = None
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    # semaphores used to signal the pin-memory threads to exit
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resume: restore weights and rewind the lr schedule to this iteration
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            # NOTE(review): the training step was commented out in the original and is
            # preserved as-is — with it disabled this loop performs no weight updates.
            # training = pinned_training_queue.get(block=True)
            # training_loss = nnet.train(**training)
            #
            # if display and iteration % display == 0:
            #     print("Process {}: training loss at iteration {}: {}".format(rank, iteration, training_loss.item()))
            # del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("Process {}: validation loss at iteration {}: {}".format(
                    rank, iteration, validation_loss.item()))
                nnet.train_mode()

            # only rank 0 writes checkpoints
            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    # bug fix: validation_tasks exists only when val_iter is set; the original
    # unconditional call raised NameError when val_iter was 0/None
    if validation_tasks is not None:
        terminate_tasks(validation_tasks)
def train(training_dbs, validation_db, system_config, model, args):
    """Training loop variant with colorized (ANSI-escape) console logging."""
    # print current location: cwd / file name / function name
    print("\033[0;33m " + "现在位置:{}/{}/.{}".format(os.getcwd(), os.path.basename(__file__), sys._getframe().f_code.co_name) + "\033[0m")

    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("\033[1;36m " + "Process {}: building model(生成模型中)...".format(rank) + "\033[0m")
    nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # four queues hold raw and pinned data for training / validation
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    validation_tasks = None
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    # semaphores used to signal the pin-memory threads to exit
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # load from a pretrained model if one was configured
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    # resume from a previous iteration if requested
    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # train the model
    if rank == 0:
        print("\033[1;36m " + "training start(训练开始)...".format(rank) + "\033[0m")
    nnet.cuda()
    nnet.train_mode()

    # bug fix: the `with stdout_to_tqdm()` line was commented out in the original
    # while the loop still referenced `save_stdout`, raising NameError; restored.
    # tqdm adds a progress bar to the long training loop.
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            # show the loss every `display` iterations
            if display and iteration % display == 0:
                print(
                    "\033[1;36m " +
                    "Process(进程){}: iteration(迭代数) [{}]时的training loss(损失函数值):"
                    .format(rank, iteration) + "\033[0m" +
                    "{}".format(training_loss.item()))
            del training_loss

            # run a validation pass every `val_iter` iterations
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\033[1;33m " + "Process {}:".format(rank) + "\033[0m" +
                      "\033[1;36m " + "validation loss at iteration {}:".format(iteration) +
                      "\033[0m" + "{}".format(validation_loss.item()))
                nnet.train_mode()

            # snapshot cadence (only rank 0 writes checkpoints)
            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            # learning-rate decay cadence
            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                print("\033[1;35m " + "此时学习率更新为:" + "\033[0m" + "{}".format(learning_rate))
                nnet.set_lr(learning_rate)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    # bug fix: only terminate validation tasks when they were actually created
    if validation_tasks is not None:
        terminate_tasks(validation_tasks)
def train(training_dbs, validation_db, system_config, model, args):
    """Training loop variant that logs losses to a module-level TensorBoard `writer`.

    Differences from the distributed variant: no rank/distributed handling,
    losses passed as explicit xs/ys, and learning-rate floors on resume/decay.
    """
    # reading arguments from command
    start_iter = args.start_iter
    initialize = args.initialize
    gpu = args.gpu

    # reading arguments from json file
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("building model...")
    nnet = NetworkFactory(system_config, model, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading:
    # one reader process per training_db feeding training_queue
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    validation_tasks = None
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    # semaphores signal the pin-memory threads to stop; the threads move data
    # from the multiprocessing queues into the pinned queues
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resume: restore weights and rewind the lr schedule, with a 1e-4 floor
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        learning_rate = max(1e-4, learning_rate)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start, max iteration {}".format(max_iteration))
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(training["xs"], training["ys"])

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))
                print("[log-loss]:{}={}".format(iteration, training_loss.item()))
                writer.add_scalar('train_loss', training_loss, global_step=iteration)
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(validation["xs"], validation["ys"])
                print("[log-validation-loss]:{}={}".format(
                    iteration, validation_loss.item()))
                writer.add_scalar('validation_loss', validation_loss, global_step=iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                # decay with a 5e-5 floor
                learning_rate /= decay_rate
                learning_rate = max(5e-5, learning_rate)
                nnet.set_lr(learning_rate)
                print("set learning rate {}".format(learning_rate))

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    # bug fix: validation_tasks exists only when val_iter is set; the original
    # unconditional call raised NameError when val_iter was 0/None
    if validation_tasks is not None:
        terminate_tasks(validation_tasks)
    writer.close()
def train(train_logger, training_dbs, validation_db, system_config, model, args):
    """Training loop variant that reports through `train_logger` (text + TensorBoard)
    and evaluates validation mAP at every snapshot.

    Args:
        train_logger: logger exposing train_logging(str) and tb_logging(tag, dict, step).
        training_dbs: list of training databases, one per reader process.
        validation_db: validation database.
        system_config: configuration object (lr schedule, snapshot cadence, val split, ...).
        model: model definition handed to NetworkFactory.
        args: command-line arguments (start_iter, distributed, gpu, rank, ...).
    """
    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate
    stepsize = system_config.stepsize

    train_logger.train_logging("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(train_logger, system_config, training_dbs, training_queue, data_sampling_func, True)
    validation_tasks = None
    if val_iter:
        validation_tasks = init_parallel_jobs(train_logger, system_config, [validation_db], validation_queue, data_sampling_func, False)

    # semaphores used to signal the pin-memory threads to exit
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        train_logger.train_logging(
            "Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        # resume: restore weights and rewind the lr schedule to this iteration
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        nnet.set_lr(learning_rate)
        train_logger.train_logging(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        train_logger.train_logging("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)
            # per-iteration TensorBoard training loss
            train_logger.tb_logging('Train/loss', {'tloss': training_loss.item()}, iteration)

            if display and iteration % display == 0:
                train_logger.train_logging(
                    "Process {}: training loss at iteration {}: {}".format(
                        rank, iteration, training_loss.item()))
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                # calculate validation loss
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                train_logger.train_logging(
                    "Process {}: validation loss at iteration {}: {}".format(
                        rank, iteration, validation_loss.item()))
                train_logger.tb_logging('Val/loss', {'vloss': validation_loss.item()}, iteration)
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                # calculate validation mAP at every snapshot, then save weights
                # (the original had two consecutive identical conditions; merged
                # here — the mAP branch does not modify `iteration` or `rank`)
                nnet.eval_mode()
                val_split = system_config.val_split
                mAP, _, detect_average_time = test(validation_db, system_config, nnet, val_iter, val_split, debug=True)
                train_logger.train_logging(
                    "Process {}: mAP at iteration {}: {}".format(
                        rank, iteration, mAP))
                train_logger.train_logging(
                    "Detect average time: {}".format(detect_average_time))
                nnet.train_mode()
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    # bug fix: validation_tasks exists only when val_iter is set; the original
    # unconditional call raised NameError when val_iter was 0/None
    if validation_tasks is not None:
        terminate_tasks(validation_tasks)
def Train(self, display_interval=100):
    """Run training using settings stored in self.system_dict.

    Saves "best" weights when validation loss improves (if a validation set is
    configured), "intermediate" weights at the validation interval otherwise,
    and "final" weights at the end of training.

    Args:
        display_interval: print training loss every this many iterations.
    """
    # reading arguments from command
    start_iter = self.system_dict["training"]["params"]["start_iter"]
    distributed = self.system_dict["model"]["params"]["distributed"]
    world_size = self.system_dict["model"]["params"]["world_size"]
    initialize = self.system_dict["model"]["params"]["initialize"]
    gpu = None
    rank = self.system_dict["model"]["params"]["rank"]

    # reading arguments from json file
    batch_size = self.system_dict["dataset"]["params"]["batch_size"]
    learning_rate = self.system_dict["training"]["params"]["lr"]
    max_iteration = self.system_dict["training"]["params"]["total_iterations"]
    pretrained_model = None
    # bug fix: floor the derived cadences at 1 — for small total_iterations
    # int(total * factor) was 0 and `iteration % stepsize` (and
    # `start_iter // stepsize`) raised ZeroDivisionError
    stepsize = max(1, int(self.system_dict["training"]["params"]["total_iterations"] * 0.8))
    snapshot = max(1, int(self.system_dict["training"]["params"]["total_iterations"] * 0.5))
    val_iter = self.system_dict["training"]["params"]["val_interval"]
    display = display_interval
    decay_rate = self.system_dict["local"]["system_config"].decay_rate

    # echo the effective settings (snapshot is reported but not otherwise used here)
    print("start_iter = {}".format(start_iter))
    print("distributed = {}".format(distributed))
    print("world_size = {}".format(world_size))
    print("initialize = {}".format(initialize))
    print("batch_size = {}".format(batch_size))
    print("learning_rate = {}".format(learning_rate))
    print("max_iteration = {}".format(max_iteration))
    print("stepsize = {}".format(stepsize))
    print("snapshot = {}".format(snapshot))
    print("val_iter = {}".format(val_iter))
    print("display = {}".format(display))
    print("decay_rate = {}".format(decay_rate))

    print("Process {}: building model...".format(rank))
    self.system_dict["local"]["nnet"] = NetworkFactory(
        self.system_dict["local"]["system_config"],
        self.system_dict["local"]["model"],
        distributed=distributed, gpu=gpu)

    # queues storing data for training
    training_queue = Queue(self.system_dict["local"]["system_config"].prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(self.system_dict["local"]["system_config"].prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(
        self.system_dict["local"]["system_config"],
        self.system_dict["local"]["training_dbs"],
        training_queue, data_sampling_func, True)
    if self.system_dict["dataset"]["val"]["status"]:
        validation_tasks = init_parallel_jobs(
            self.system_dict["local"]["system_config"],
            [self.system_dict["local"]["validation_db"]],
            validation_queue, data_sampling_func, False)

    # semaphores used to signal the pin-memory threads to exit
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # pretrained_model is hard-coded to None above, so this branch is currently dead
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        self.system_dict["local"]["nnet"].load_pretrained_params(pretrained_model)

    if start_iter:
        # resume: restore weights and rewind the lr schedule to this iteration
        self.system_dict["local"]["nnet"].load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        self.system_dict["local"]["nnet"].set_lr(learning_rate)
        print("Process {}: training starts from iteration {} with learning_rate {}".format(rank, start_iter + 1, learning_rate))
    else:
        self.system_dict["local"]["nnet"].set_lr(learning_rate)

    if rank == 0:
        print("training start...")
    self.system_dict["local"]["nnet"].cuda()
    self.system_dict["local"]["nnet"].train_mode()

    if self.system_dict["dataset"]["val"]["status"]:
        # track best validation loss; save "best" weights on improvement
        old_val_loss = 100000.0
        with stdout_to_tqdm() as save_stdout:
            for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
                training = pinned_training_queue.get(block=True)
                training_loss = self.system_dict["local"]["nnet"].train(**training)

                if display and iteration % display == 0:
                    print("Process {}: training loss at iteration {}: {}".format(rank, iteration, training_loss.item()))
                del training_loss

                if val_iter and self.system_dict["local"]["validation_db"].db_inds.size and iteration % val_iter == 0:
                    self.system_dict["local"]["nnet"].eval_mode()
                    validation = pinned_validation_queue.get(block=True)
                    validation_loss = self.system_dict["local"]["nnet"].validate(**validation)
                    print("Process {}: validation loss at iteration {}: {}".format(rank, iteration, validation_loss.item()))
                    if validation_loss < old_val_loss:
                        print("Loss Reduced from {} to {}".format(old_val_loss, validation_loss))
                        self.system_dict["local"]["nnet"].save_params("best")
                        old_val_loss = validation_loss
                    else:
                        print("validation loss did not go below {}, current loss - {}".format(old_val_loss, validation_loss))
                    self.system_dict["local"]["nnet"].train_mode()

                if iteration % stepsize == 0:
                    learning_rate /= decay_rate
                    self.system_dict["local"]["nnet"].set_lr(learning_rate)

        self.system_dict["local"]["nnet"].save_params("final")

        # sending signal to kill the pin-memory threads
        training_pin_semaphore.release()
        validation_pin_semaphore.release()

        # terminating data fetching processes
        terminate_tasks(training_tasks)
        terminate_tasks(validation_tasks)
    else:
        with stdout_to_tqdm() as save_stdout:
            for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
                training = pinned_training_queue.get(block=True)
                training_loss = self.system_dict["local"]["nnet"].train(**training)

                if display and iteration % display == 0:
                    print("Process {}: training loss at iteration {}: {}".format(rank, iteration, training_loss.item()))
                del training_loss

                # bug fix: guard val_iter != 0 — the original `iteration % val_iter`
                # raised ZeroDivisionError when val_interval was 0
                if val_iter and iteration % val_iter == 0:
                    self.system_dict["local"]["nnet"].save_params("intermediate")

                if iteration % stepsize == 0:
                    learning_rate /= decay_rate
                    self.system_dict["local"]["nnet"].set_lr(learning_rate)

        self.system_dict["local"]["nnet"].save_params("final")

        # sending signal to kill the pin-memory threads
        training_pin_semaphore.release()
        # bug fix: the validation pin thread is started unconditionally above,
        # so signal it to exit here too (original only released the training one)
        validation_pin_semaphore.release()

        # terminating data fetching processes
        terminate_tasks(training_tasks)