def train(model, train_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True,
                                  num_workers=2, collate_fn=collate_fn, drop_last=True)
    model.train()
    # Total number of optimizer steps across all epochs.
    # Note: total_steps divides by gradient_accumulation, but this variant steps
    # the optimizer on every batch, so the two are not fully consistent.
    total_steps = int(len(train_dataset) * Config.epochs / Config.batch_size / Config.gradient_accumulation)
    print("total train step num: {}".format(total_steps))
    optimizer = BertAdam(model.parameters(), lr=Config.lr, warmup=0.05, t_total=total_steps)
    print('start training...')
    for epoch in range(Config.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # Note: GPT-2's forward() generates one token for a given context, not a
            # whole sequence. Given n token ids, GPT2Model outputs n hidden states;
            # the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(Config.device)
            outputs = model.forward(input_ids=input_ids)
            loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            print('epoch:{}, step:{}, loss: {:.6f}, accuracy:{:.6f}'.format(
                epoch + 1, batch_idx + 1, loss.item(), accuracy.item()))
        average_acc, average_loss = evaluate(model, test_list)
        res = "VALID epoch:{}, loss {:.6f}, acc {:.6f}".format(epoch + 1, average_loss, average_acc)
        print(res)
        res += '\n'
        with open('log.txt', 'a+') as f:
            f.write(res)
        # Save the model after each epoch.
        model_path = join(Config.model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        epoch_finish_time = datetime.now()
        print('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
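# Several training loops in this file call calculate_loss_and_accuracy() without
# defining it. A minimal sketch follows, assuming the usual GPT-2 language-modeling
# setup: shift logits/labels by one position and ignore pad tokens. The pad id and
# the exact masking are assumptions, not the original code.
def calculate_loss_and_accuracy(outputs, labels, device=None, pad_id=0):
    logits = outputs[0]  # [batch, seq_len, vocab]
    # Predict token n+1 from hidden state n: drop the last logit and the first label.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_id, reduction='mean')
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))
    # Token-level accuracy over non-pad positions.
    preds = shift_logits.argmax(dim=-1)
    not_pad = shift_labels.ne(pad_id)
    accuracy = (preds.eq(shift_labels) & not_pad).sum().float() / not_pad.sum().float()
    return loss, accuracy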
def detect_test_data(model_path='weights/face_mask_weights.pth', num=20):
    yolo_detector = YOLO4_inference(model_path=model_path)
    test_data = MyDataset(test_root)
    for i in random.sample(range(len(test_data)), num):
        img, gt = test_data.getRaw(i)
        image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        boxes, labels, scores = yolo_detector.predict(image)
        detect_test_data_show(image, gt, boxes, labels, scores)
def train(args):
    # Note: despite the name, this function only runs forward passes over a saved
    # model and reports accuracy; it never computes a loss or updates parameters.
    print(args)
    ds_train = MyDataset(My_PATH, set='train')
    ds_val = MyDataset(My_PATH, set='val')
    loader_train = data_utils.DataLoader(ds_train, batch_size=args.batch_size,
                                         num_workers=args.nb_worker, shuffle=True)
    loader_val = data_utils.DataLoader(ds_val, batch_size=1, num_workers=1, shuffle=False)
    model = torch.load('./models/2.pkl')
    model = model.cuda(0)
    trainAcc = 0
    trainNum = len(ds_train)
    for i, (images, label) in enumerate(loader_train):
        images = Variable(images.cuda(0))
        label = Variable(label.cuda(0))
        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        trainAcc += torch.sum(pred == label.data)
        print(i)
        print(trainAcc)
    print('-----------------------------')
    valAcc = 0
    for i, (images, label) in enumerate(loader_val):
        images = Variable(images.cuda(0))
        label = label.cuda(0)
        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        valAcc += torch.sum(pred == label)
        print(i)
        print(valAcc)
    print("Epoch [%d/%d],trainAcc: %.4f,valAcc: %.4f" %
          (1, args.nb_epoch, int(trainAcc) * 1.0 / trainNum,
           int(valAcc) * 1.0 / (i + 1)))
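# The accuracy passes above track gradients they never use. A minimal sketch of
# the same validation loop wrapped in model.eval() / torch.no_grad(), which is
# how this would normally be written (the loop body itself is unchanged):
def validate(model, loader_val):
    model.eval()
    valAcc = 0
    with torch.no_grad():  # no autograd bookkeeping during evaluation
        for i, (images, label) in enumerate(loader_val):
            outputs = model(images.cuda(0))
            _, pred = torch.max(outputs, 1)
            valAcc += torch.sum(pred == label.cuda(0))
    model.train()
    return int(valAcc) * 1.0 / (i + 1)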
def train(model, device, train_list, multi_gpu, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False,
                                  num_workers=args.num_workers, collate_fn=collate_fn)
    total_steps = int(len(train_dataset) * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    model_path = join(args.model_output_path, "saved.pt")
    # Resume from the latest checkpoint if one exists.
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
        logger.info("running loss:{}, overall step:{}".format(running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        # Skip batches already covered by the restored checkpoint.
        if batch_idx <= overall_step:
            continue
        input_ids = input_ids.to(device)
        try:
            mu, logvar, bow_probs = model.forward(input=input_ids)
            bow_loss = calculate_bow(bow_probs, input_ids, device)
            loss = bow_loss
            if multi_gpu:
                loss = loss.mean()
            if args.gradient_accumulation > 1:
                loss = loss / args.gradient_accumulation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            if (batch_idx + 1) % args.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if (overall_step + 1) % args.log_step == 0 or (overall_step + 1 == total_steps):
                    logger.info("step {}, loss {:.6}".format(overall_step, loss.item()))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
                if (overall_step + 1) % save_step == 0 or (overall_step == total_steps):
                    logger.info('saving for step {}'.format(overall_step))
                    if not os.path.exists(args.model_output_path):
                        os.mkdir(args.model_output_path)
                    torch.save(
                        {
                            # 'finished_epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'overall_step': overall_step,
                            'running_loss': running_loss
                        }, model_path)
                    logger.info('finish saving for step {}'.format(overall_step))
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    logger.info("batch_idx = {}".format(batch_idx))
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
    logger.info('training finished')
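# calculate_bow() is not defined in these snippets. A minimal sketch, assuming
# the bag-of-words auxiliary loss from CVAE dialogue models (Zhao et al., 2017):
# bow_probs is taken to be [batch, vocab] log-probabilities predicted from the
# latent code, scored against every non-pad token of the target. The tensor
# shape and the pad id are assumptions.
def calculate_bow(bow_probs, input_ids, device, pad_id=0):
    mask = input_ids.ne(pad_id).float()                 # [batch, seq_len]
    token_logp = torch.gather(bow_probs, 1, input_ids)  # log p(w_t | z) per token
    per_seq = -(token_logp * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return per_seq.mean()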
def train(model, device, train_list, multi_gpu, args, valid_list, test_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True,
                                  num_workers=args.num_workers, collate_fn=collate_fn)
    model.train()
    # Total number of optimizer steps across all epochs.
    total_steps = int(len(train_dataset) * args.epochs / args.batch_size / args.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    # Set up the optimizer with a linear warmup schedule.
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    # Accumulated loss across gradient-accumulation steps.
    running_loss = 0
    # Number of optimizer steps taken so far.
    overall_step = 0
    # tensorboardX writer.
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # Number of CUDA out-of-memory events.
    oom_time = 0
    for epoch in range(args.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # Note: GPT-2's forward() generates one token for a given context, not a
            # whole sequence. Given n token ids, GPT2Model outputs n hidden states;
            # the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            # Catch CUDA out-of-memory errors so one oversized batch does not kill the run.
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # Gradient clipping guards against exploding gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # Update parameters after accumulating gradients for the configured number of steps.
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    # Log to the logger and tensorboardX.
                    if (overall_step + 1) % args.log_step == 0:
                        logger.info("batch {} of epoch {}, loss {}, accuracy {}".format(
                            batch_idx + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if args.train_mmi:
            # Currently training the MMI model.
            model_path = join(args.mmi_model_output_path, 'model_epoch{}'.format(epoch + 1))
        else:
            # Currently training the dialogue model.
            model_path = join(args.dialogue_model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
        logger.info("Start Valid Set")
        evaluate(model, device, valid_list, multi_gpu, args)
        logger.info("Start Test Set")
        evaluate(model, device, test_list, multi_gpu, args)
    logger.info('training finished')
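# evaluate() is called above but not defined in these snippets. A minimal
# sketch, assuming it mirrors the training loop without backpropagation and
# logs average loss/accuracy over the given list:
def evaluate(model, device, data_list, multi_gpu, args):
    model.eval()
    dataset = MyDataset(data_list)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
                            num_workers=args.num_workers, collate_fn=collate_fn)
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for input_ids in dataloader:
            input_ids = input_ids.to(device)
            outputs = model(input_ids=input_ids)
            loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
            if multi_gpu:
                loss, accuracy = loss.mean(), accuracy.mean()
            total_loss += loss.item()
            total_acc += accuracy.item()
            n_batches += 1
    model.train()
    logger.info('evaluation: loss {:.6f}, accuracy {:.6f}'.format(
        total_loss / n_batches, total_acc / n_batches))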
def train(model, device, train_list, args):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False,
                                  num_workers=args.num_workers, collate_fn=collate_fn)
    total_steps = int(len(train_dataset) * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))
    save_step = max(int(args.save_step_percentage * total_steps), 1)
    logger.info('save per {} steps'.format(save_step))
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    running_loss = 0
    overall_step = -1
    kl_anneal_x0 = int(total_steps * args.kl_anneal_percentage)
    model_path = join(args.model_output_path, "saved.pt")
    # Resume from the latest checkpoint if one exists.
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # finished_epoch = checkpoint['finished_epoch'] + 1
        running_loss = checkpoint['running_loss']
        overall_step = checkpoint['overall_step']
        logger.info("running loss:{}, overall step:{}".format(running_loss, overall_step))
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    oom_time = 0
    model.train()
    oom_flag = False
    # for epoch in range(finished_epoch, args.epochs):
    epoch_start_time = datetime.now()
    for batch_idx, input_ids in enumerate(train_dataloader):
        # Skip batches already covered by the restored checkpoint.
        if batch_idx <= overall_step:
            continue
        input_ids = input_ids.to(device)
        try:
            outputs, mu, logvar, bow_probs = model.forward(input=input_ids)
            ce, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
            # KL weight is annealed up to 0.5 over training (anneal_function, step, k, x0).
            kl_weight = min(0.5, kl_anneal_function(anneal_function=args.kl_anneal_function,
                                                    step=overall_step,
                                                    k=args.kl_anneal_k,
                                                    x0=kl_anneal_x0))
            kld = (-0.5 * torch.sum(logvar - torch.pow(mu, 2) - torch.exp(logvar) + 1, 1)).mean().squeeze()
            bow_loss = calculate_bow(bow_probs, input_ids, device)
            loss = ce + kl_weight * kld + args.bow_weight * bow_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            overall_step += 1
            if overall_step == 0 or (overall_step + 1) % args.log_step == 0 or (overall_step + 1 == total_steps):
                logger.info("step {}, ce {:.6}, kld {:.6}, kl_weight {:.6}, bow {:.6}, "
                            "bow_weight {:.6}, loss {:.6}, accuracy {:.6}".format(
                                overall_step, ce, kld, kl_weight, bow_loss,
                                args.bow_weight, loss, accuracy))
                tb_writer.add_scalar('ce', ce.item(), overall_step)
                tb_writer.add_scalar('kld', kld.item(), overall_step)
                tb_writer.add_scalar('loss', loss.item(), overall_step)
            if (overall_step + 1) % save_step == 0 or (overall_step + 1 == total_steps):
                logger.info('saving for step {}'.format(overall_step))
                if not os.path.exists(args.model_output_path):
                    os.mkdir(args.model_output_path)
                torch.save(
                    {
                        # 'finished_epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'overall_step': overall_step,
                        'running_loss': running_loss
                    }, model_path)
                decoder_path = join(args.model_output_path, 'decoder/')
                if not os.path.exists(decoder_path):
                    os.mkdir(decoder_path)
                model.save_decoder(decoder_path)
                logger.info('finish saving for step {}'.format(overall_step))
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                oom_time += 1
                if not oom_flag:
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    logger.info("batch_idx = {}".format(batch_idx))
                    oom_flag = True
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                logger.info(str(exception))
                raise exception
    epoch_finish_time = datetime.now()
    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
    logger.info('training finished')
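# kl_anneal_function() is not defined in these snippets. A minimal sketch,
# assuming the standard annealing recipes from Bowman et al. (2016); the
# signature matches the call site above, but the bodies are an assumption.
import math

def kl_anneal_function(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        # Sigmoid ramp: ~0 early in training, ~1 after x0 steps; k controls the slope.
        return float(1 / (1 + math.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        # Linear ramp from 0 to 1 over the first x0 steps.
        return min(1.0, step / x0)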
def train(model, device, train_list, multi_gpu, args, logdir):
    train_dataset = MyDataset(train_list, args.tensor_cache)
    # loader_batch_size = int(args.batch_size / args.gradient_accumulation / torch.cuda.device_count())
    # print(loader_batch_size)
    if args.distributed:
        loader_batch_size = int(args.batch_size / args.gradient_accumulation / torch.cuda.device_count())
        sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=False,
                                      num_workers=args.num_workers, collate_fn=collate_fn,
                                      sampler=sampler)
    else:
        loader_batch_size = int(args.batch_size / args.gradient_accumulation)
        train_dataloader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True,
                                      num_workers=args.num_workers, collate_fn=collate_fn)
    print(loader_batch_size)
    model.train()
    # Total number of optimizer steps across all epochs.
    total_steps = int(len(train_dataset) * args.epochs / args.batch_size)
    logger.info('total training steps = {}'.format(total_steps))
    # Set up the optimizer with a Noam warmup schedule.
    base_optimizer = transformers.AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    # if args.linear_schedule:
    #     optimizer = NoamOpt(model.embeddings_size, args.warmup_steps, base_optimizer, lr=args.lr,
    #                         linear_schedule=True, total_steps=total_steps, apex_level=None,
    #                         loss_weight=None, extra_module_lr_rate=1)
    # else:
    #     optimizer = NoamOpt(model.embeddings_size, args.warmup_steps, base_optimizer, lr=args.lr,
    #                         linear_schedule=False, apex_level=None, loss_weight=None,
    #                         extra_module_lr_rate=1)
    optimizer = NoamOpt(768, args.warmup_steps, base_optimizer, lr=args.lr, linear_schedule=True,
                        total_steps=total_steps, apex_level=None, loss_weight=None,
                        extra_module_lr_rate=1)
    logger.info('starting training')
    running_loss = 0  # accumulated loss across gradient-accumulation steps
    overall_step = 0  # number of optimizer steps taken so far
    log_loss = 0
    log_acc = 0
    log_ppl = 0
    tb_writer = SummaryWriter(log_dir=args.writer_dir)  # tensorboardX writer
    oom_time = 0  # number of CUDA out-of-memory events
    for epoch in tqdm(range(args.epochs), desc='Epoch'):
        tqdm_data = tqdm(train_dataloader, desc="Train (epoch #{})".format(epoch))
        for batch_idx, input_ids in enumerate(tqdm_data):
            # Note: GPT-2's forward() generates one token for a given context, not a
            # whole sequence. Given n token ids, GPT2Model outputs n hidden states;
            # the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            # Pad the batch so its size is divisible by the number of GPUs.
            rest_size = 0 if device == 'cpu' else (
                torch.cuda.device_count() - input_ids.size(0) % torch.cuda.device_count())
            if rest_size != 0:
                input_ids = torch.cat([input_ids] + [input_ids[:1, :]] * rest_size, dim=0)
            # Catch CUDA out-of-memory errors so one oversized batch does not kill the run.
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids, device=device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if args.gradient_accumulation > 1:
                    loss = loss / args.gradient_accumulation
                    accuracy = accuracy / args.gradient_accumulation
                loss.backward()
                # Gradient clipping guards against exploding gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # Update parameters after accumulating gradients for the configured number of steps.
                if (batch_idx + 1) % args.gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    overall_step += 1
                    log_loss = (log_loss * batch_idx + loss.item()) / (batch_idx + 1)
                    log_acc = (log_acc * batch_idx + accuracy.item()) / (batch_idx + 1)
                    tqdm_data.set_postfix({'loss': log_loss, 'acc': log_acc})
                    # Log to the logger and tensorboardX.
                    if (overall_step + 1) % args.log_step == 0:
                        if args.local_rank in [-1, 0]:
                            logger.info('loss is %.5f, acc is %.5f', log_loss, log_acc)
                            tb_writer.add_scalar('loss', loss.item(), overall_step)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(exception))
                    raise exception
        if args.local_rank in [-1, 0]:
            logger.info('saving model for epoch {}'.format(epoch + 1))
            model_path = join(logdir, 'model_epoch{}'.format(epoch + 1))
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(model_path)
            logger.info('epoch {} finished'.format(epoch + 1))
    logger.info('training finished')
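# NoamOpt is not defined in these snippets; the real class clearly takes more
# arguments (linear_schedule, apex_level, loss_weight, ...). A minimal sketch of
# just the core Noam schedule from "Attention Is All You Need", as popularized
# by The Annotated Transformer: lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5).
class NoamOptSketch:
    def __init__(self, model_size, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.warmup = warmup
        self._step = 0

    def rate(self, step):
        # Rises linearly for `warmup` steps, then decays as step^-0.5.
        return self.model_size ** (-0.5) * min(step ** (-0.5), step * self.warmup ** (-1.5))

    def step(self):
        self._step += 1
        for group in self.optimizer.param_groups:
            group['lr'] = self.rate(self._step)
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()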
def train(model, train_list):
    train_dataset = MyDataset(train_list)
    train_dataloader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True,
                                  num_workers=2, collate_fn=collate_fn, drop_last=True)
    model.train()
    # Total number of optimizer steps across all epochs.
    total_steps = int(len(train_dataset) * Config.epochs / Config.batch_size / Config.gradient_accumulation)
    print("total train step num: {}".format(total_steps))
    # Set up the optimizer with a linear warmup schedule.
    optimizer = transformers.AdamW(model.parameters(), lr=Config.lr, correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=Config.warmup_steps, num_training_steps=total_steps)
    print("start training ...")
    running_loss = 0  # accumulated loss across gradient-accumulation steps
    overall_step = 0  # number of optimizer steps taken so far
    tb_writer = SummaryWriter(log_dir=Config.writer_dir)  # tensorboardX log directory
    for epoch in range(Config.epochs):
        epoch_start_time = datetime.now()
        for batch_idx, input_ids in enumerate(train_dataloader):
            # input_ids.size() is e.g. torch.Size([2, 208]): sequences are padded per batch.
            # Note: GPT-2's forward() generates one token for a given context, not a
            # whole sequence. Given n token ids, GPT2Model outputs n hidden states;
            # the n-th hidden state is used to predict token n+1.
            input_ids = input_ids.to(Config.device)
            outputs = model.forward(input_ids=input_ids)
            loss, accuracy = calculate_loss_and_accuracy(outputs, labels=input_ids)
            if Config.gradient_accumulation > 1:
                loss = loss / Config.gradient_accumulation
                accuracy = accuracy / Config.gradient_accumulation
            loss.backward()
            # Gradient clipping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)
            # Update parameters after accumulating gradients for the configured number of
            # steps. (Parentheses are required: '%' binds tighter than '+', so the original
            # `batch_idx + 1 % Config.gradient_accumulation` tested the wrong expression.)
            if (batch_idx + 1) % Config.gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()  # advance the warmup schedule
                overall_step += 1
                if (overall_step + 1) % Config.log_step == 0:
                    print('epoch:{}, step:{}, loss: {}, accuracy:{}'.format(
                        epoch + 1, batch_idx + 1, loss, accuracy))
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
        # Save the model after each epoch.
        model_path = join(Config.model_output_path, 'model_epoch{}'.format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_path)
        epoch_finish_time = datetime.now()
        print('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
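# collate_fn is passed to every DataLoader above but never defined here. A
# minimal sketch, assuming each dataset item is a list of token ids and that 0
# is the pad id (both assumptions): it pads every sequence to the longest one
# in its batch, which is why input_ids.size(1) varies between batches.
def collate_fn(batch):
    pad_id = 0  # assumed pad token id
    max_len = max(len(ids) for ids in batch)
    padded = [list(ids) + [pad_id] * (max_len - len(ids)) for ids in batch]
    return torch.tensor(padded, dtype=torch.long)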
def train(model, device, train_list, multi_gpu, hparams):
    train_data = MyDataset(train_list)
    train_dataloader = DataLoader(train_data, batch_size=hparams.batch_size, shuffle=True,
                                  num_workers=hparams.num_workers, collate_fn=collate)
    model.train()
    total_steps = int(len(train_data) * hparams.epochs / hparams.batch_size / hparams.gradient_accumulation)
    logger.info('total training steps = {}'.format(total_steps))
    optimizer = transformers.AdamW(model.parameters(), lr=hparams.lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=hparams.warmup_steps,
                                                  t_total=total_steps)
    logger.info('starting training')
    run_loss = 0
    over_step = 0
    tb_writer = SummaryWriter(log_dir=hparams.writer_dir)
    oom_time = 0  # number of out-of-memory events
    for epoch in range(hparams.epochs):
        start_time = datetime.now()
        for batch_index, input_ids in enumerate(train_dataloader):
            # Given n token ids, GPT2Model outputs n hidden states; the n-th
            # hidden state is used to predict token n+1.
            input_ids = input_ids.to(device)
            try:
                outputs = model.forward(input_ids=input_ids)
                loss, accuracy = cal_loss_accuracy(outputs, input_ids, device)
                if multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if hparams.gradient_accumulation > 1:
                    loss = loss / hparams.gradient_accumulation
                    accuracy = accuracy / hparams.gradient_accumulation
                loss.backward()
                # Gradient clipping guards against exploding gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.max_grad_norm)
                if (batch_index + 1) % hparams.gradient_accumulation == 0:
                    run_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()  # clear accumulated gradients
                    scheduler.step()
                    over_step += 1
                    if (over_step + 1) % hparams.log_step == 0:
                        logger.info("batch {} of epoch {}, loss {}, accuracy {}".format(
                            batch_index + 1, epoch + 1, loss, accuracy))
                        tb_writer.add_scalar('loss', loss.item(), over_step)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    oom_time += 1
                    logger.info("WARNING: ran out of memory, times: {}".format(oom_time))
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    logger.info(str(e))
                    raise e  # re-raise non-OOM errors instead of silently swallowing them
        logger.info('saving model for epoch {}'.format(epoch + 1))
        if hparams.train_mmi:
            model_path = join(hparams.mmi_model_output_path, "model_epoch{}".format(epoch + 1))
        else:
            model_path = join(hparams.dialog_model_output_path, "model_epoch{}".format(epoch + 1))
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(model_path)
        logger.info('epoch {} finished'.format(epoch + 1))
        epoch_finish_time = datetime.now()
        logger.info('time for one epoch: {}'.format(epoch_finish_time - start_time))
    logger.info('training finished')
def train(args):
    print(args)
    ds_train = MyDataset(My_PATH, set='train')
    ds_val = MyDataset(My_PATH, set='val')
    loader_train = data_utils.DataLoader(ds_train, batch_size=args.batch_size,
                                         num_workers=args.nb_worker, shuffle=True)
    loader_val = data_utils.DataLoader(ds_val, batch_size=1, num_workers=1, shuffle=False)
    # VGG-16 backbone with a fresh two-class classifier head.
    model = models.vgg16(pretrained=True)
    model.classifier = torch.nn.Sequential(torch.nn.Linear(25088, 4096),
                                           torch.nn.ReLU(),
                                           torch.nn.Dropout(p=0.5),
                                           torch.nn.Linear(4096, 4096),
                                           torch.nn.ReLU(),
                                           torch.nn.Dropout(p=0.5),
                                           torch.nn.Linear(4096, 2))
    print("init_params done.")
    model = model.cuda(0)
    if not os.path.exists("./models"):
        os.mkdir("./models")
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    cost = torch.nn.CrossEntropyLoss()
    cost = cost.cuda(0)
    for epoch in range(args.nb_epoch):
        starttime = datetime.datetime.now()
        trainAcc = 0
        trainNum = len(ds_train)
        for i, (images, label) in enumerate(loader_train):
            images = Variable(images.cuda(0))
            label = Variable(label.cuda(0))
            optimizer.zero_grad()
            outputs = model(images)
            loss = cost(outputs, label)
            _, pred = torch.max(outputs.data, 1)
            trainAcc += torch.sum(pred == label.data)
            loss.backward()
            optimizer.step()
        valAcc = 0
        for i, (images, label) in enumerate(loader_val):
            images = Variable(images.cuda(0))
            label = label.cuda(0)
            outputs = model(images)
            _, pred = torch.max(outputs.data, 1)
            valAcc += torch.sum(pred == label)
        # loss.item() replaces the long-deprecated loss.data[0].
        print("Epoch [%d/%d] Loss: %.6f,trainAcc: %.4f,valAcc: %.4f" % (
            epoch + 1, args.nb_epoch, loss.item(),
            int(trainAcc) * 1.0 / trainNum, int(valAcc) * 1.0 / (i + 1)))
        logging.info("Epoch [%d/%d] Loss: %.6f,trainAcc: %.4f,valAcc: %.4f" % (
            epoch + 1, args.nb_epoch, loss.item(),
            int(trainAcc) * 1.0 / trainNum, int(valAcc) * 1.0 / (i + 1)))
        torch.save(model, "./models/{}.pkl".format(epoch))
        endtime = datetime.datetime.now()
        print((endtime - starttime).seconds)
        logging.info('time:{}'.format((endtime - starttime).seconds))
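# A hypothetical entry point for the VGG-16 trainer above, wiring up the args it
# reads (batch_size, nb_worker, nb_epoch, lr); the default values are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--nb_worker', type=int, default=4)
    parser.add_argument('--nb_epoch', type=int, default=10)
    parser.add_argument('--lr', type=float, default=1e-4)
    train(parser.parse_args())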