def infer(self, model, epoch=0):
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    data_time = utils.AverageMeter()
    batch_time = utils.AverageMeter()
    model.eval()

    start = time.time()
    prefetcher = data_prefetcher(self.val_data)
    input, target = prefetcher.next()
    step = 0
    with torch.no_grad():  # evaluation only; skip autograd bookkeeping
        while input is not None:
            step += 1
            data_t = time.time() - start
            n = input.size(0)
            logits, logits_aux = model(input)
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            batch_t = time.time() - start

            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            data_time.update(data_t)
            batch_time.update(batch_t)

            if step % self.report_freq == 0:
                logging.info(
                    'Val epoch %03d step %03d | top1_acc %.2f top5_acc %.2f | '
                    'batch_time %.3f data_time %.3f',
                    epoch, step, top1.avg, top5.avg, batch_time.avg, data_time.avg)
            start = time.time()
            input, target = prefetcher.next()

    logging.info(
        'EPOCH%d Valid_acc top1 %.2f top5 %.2f batch_time %.3f data_time %.3f',
        epoch, top1.avg, top5.avg, batch_time.avg, data_time.avg)
    return top1.avg, top5.avg, batch_time.avg, data_time.avg

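# The loops in this file all rely on a small `utils.AverageMeter` helper.
# Below is a minimal sketch inferred from its call sites -- update(val, n=1),
# .val / .avg attributes, and an optional (name, fmt) constructor used by the
# ProgressMeter-based loops further down. The real `utils` module may differ.
class AverageMeter:
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self, name='', fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0.0

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(name=self.name, val=self.val, avg=self.avg)
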
def validate(logger, writer, device, config, valid_loader, model, epoch, cur_step):
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    losses = utils.AverageMeter()

    model.eval()
    with torch.no_grad():
        for step, (X, y) in enumerate(valid_loader):
            X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
            N = X.size(0)

            logits = model(X)
            loss = model.module.criterion(logits, y)

            prec1, prec5 = utils.accuracy(logits, y, topk=(1, 5))
            losses.update(loss.item(), N)
            top1.update(prec1.item(), N)
            top5.update(prec5.item(), N)

            if step % config.print_freq == 0 or step == len(valid_loader) - 1:
                logger.info(
                    "Valid: [{:2d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                    "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
                        epoch + 1, config.epochs, step, len(valid_loader) - 1,
                        losses=losses, top1=top1, top5=top5))

    writer.add_scalar('val/loss', losses.avg, cur_step)
    writer.add_scalar('val/top1', top1.avg, cur_step)
    writer.add_scalar('val/top5', top5.avg, cur_step)

    logger.info("Valid: [{:2d}/{}] Final Prec@1 {:.4%}".format(
        epoch + 1, config.epochs, top1.avg))
    return top1.avg

def train(logger, writer, device, config, train_loader, model, optimizer, criterion, epoch):
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    losses = utils.AverageMeter()

    cur_step = epoch * len(train_loader)
    cur_lr = optimizer.param_groups[0]['lr']
    logger.info("Epoch {} LR {}".format(epoch, cur_lr))
    writer.add_scalar('train/lr', cur_lr, cur_step)

    model.train()
    for step, (X, y) in enumerate(train_loader):
        X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
        N = X.size(0)

        optimizer.zero_grad()
        logits, aux_logits = model(X)
        loss = criterion(logits, y)
        if config.aux_weight > 0.:
            loss += config.aux_weight * criterion(aux_logits, y)
        loss.backward()

        # gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        if config.dist_privacy:
            # privacy-preserving gradient clipping
            clipping_dispatcher(model.module.named_weights(),
                                config.max_weights_grad_norm,
                                config.var_gamma, device, logger)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, y, topk=(1, 5))
        losses.update(loss.item(), N)
        top1.update(prec1.item(), N)
        top5.update(prec5.item(), N)

        if step % config.print_freq == 0 or step == len(train_loader) - 1:
            logger.info(
                "Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
                    epoch + 1, config.epochs, step, len(train_loader) - 1,
                    losses=losses, top1=top1, top5=top5))

        writer.add_scalar('train/loss', loss.item(), cur_step)
        writer.add_scalar('train/top1', prec1.item(), cur_step)
        writer.add_scalar('train/top5', prec5.item(), cur_step)
        cur_step += 1

    logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(
        epoch + 1, config.epochs, top1.avg))

def train(self, model, epoch):
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    data_time = utils.AverageMeter()
    batch_time = utils.AverageMeter()
    model.train()

    start = time.time()
    prefetcher = data_prefetcher(self.train_data)
    input, target = prefetcher.next()
    step = 0
    while input is not None:
        data_t = time.time() - start
        self.scheduler.step()  # per-iteration LR schedule
        n = input.size(0)
        if step == 0:
            logging.info('epoch %d lr %e', epoch,
                         self.optimizer.param_groups[0]['lr'])

        self.optimizer.zero_grad()
        logits = model(input)
        if self.config.optim.label_smooth:
            loss = self.criterion(logits, target, self.config.optim.smooth_alpha)
        else:
            loss = self.criterion(logits, target)
        loss.backward()
        if self.config.optim.use_grad_clip:
            nn.utils.clip_grad_norm_(model.parameters(), self.config.optim.grad_clip)
        self.optimizer.step()

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        batch_t = time.time() - start
        start = time.time()

        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)
        data_time.update(data_t)
        batch_time.update(batch_t)

        if step != 0 and step % self.report_freq == 0:
            logging.info(
                'Train epoch %03d step %03d | loss %.4f top1_acc %.2f top5_acc %.2f | '
                'batch_time %.3f data_time %.3f',
                epoch, step, objs.avg, top1.avg, top5.avg,
                batch_time.avg, data_time.avg)

        input, target = prefetcher.next()
        step += 1

    logging.info(
        'EPOCH%d Train_acc top1 %.2f top5 %.2f batch_time %.3f data_time %.3f',
        epoch, top1.avg, top5.avg, batch_time.avg, data_time.avg)
    return top1.avg, top5.avg, objs.avg, batch_time.avg, data_time.avg

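# `data_prefetcher` is used above and below but not defined in these snippets.
# A common pattern it follows (e.g., NVIDIA's ImageNet examples) overlaps
# host-to-device copies with compute on a side CUDA stream; this sketch is an
# assumption, not the project's actual class.
import torch

class data_prefetcher:
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input, self.next_target = None, None
            return
        with torch.cuda.stream(self.stream):
            # asynchronous copy on the side stream
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def next(self):
        # make the compute stream wait for the pending copy, then hand the
        # batch out and immediately start prefetching the next one
        torch.cuda.current_stream().wait_stream(self.stream)
        input, target = self.next_input, self.next_target
        self.preload()
        return input, target
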
def train(logger, writer, device, config, train_loader, valid_loader, model,
          architect, w_optim, alpha_optim, lr, epoch):
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    losses = utils.AverageMeter()

    cur_step = epoch * len(train_loader)
    writer.add_scalar('train/lr', lr, cur_step)

    model.train()
    for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(
            zip(train_loader, valid_loader)):
        trn_X, trn_y = trn_X.to(device, non_blocking=True), trn_y.to(device, non_blocking=True)
        val_X, val_y = val_X.to(device, non_blocking=True), val_y.to(device, non_blocking=True)
        N = trn_X.size(0)

        # phase 2. architect step (alpha)
        alpha_optim.zero_grad()
        architect.unrolled_backward(config, trn_X, trn_y, val_X, val_y, lr, w_optim)
        alpha_optim.step()

        # phase 1. child network step (w)
        w_optim.zero_grad()
        logits = model(trn_X)
        loss = model.module.criterion(logits, trn_y)
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(model.module.weights(), config.w_grad_clip)
        if config.dist_privacy:
            # privacy-preserving gradient clipping
            clipping_dispatcher(model.module.named_weights(),
                                config.max_weights_grad_norm,
                                config.var_gamma, device, logger)
        w_optim.step()

        prec1, prec5 = utils.accuracy(logits, trn_y, topk=(1, 5))
        losses.update(loss.item(), N)
        top1.update(prec1.item(), N)
        top5.update(prec5.item(), N)

        if step % config.print_freq == 0 or step == len(train_loader) - 1:
            logger.info(
                "Train: [{:2d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
                    epoch + 1, config.epochs, step, len(train_loader) - 1,
                    losses=losses, top1=top1, top5=top5))

        writer.add_scalar('train/loss', loss.item(), cur_step)
        writer.add_scalar('train/top1', prec1.item(), cur_step)
        writer.add_scalar('train/top5', prec5.item(), cur_step)
        cur_step += 1

    logger.info("Train: [{:2d}/{}] Final Prec@1 {:.4%}".format(
        epoch + 1, config.epochs, top1.avg))

def test(test_loader, model, criterion, epoch, minimal_error):
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    error = utils.AverageMeter()

    with torch.no_grad():
        model.eval()
        end = time.time()
        for batch_idx, input in enumerate(test_loader):
            data_time.update(time.time() - end)
            data, target = tuple(input[:len(input) - 1]), input[-1]
            if args.cuda:
                data, target = tuple(map(lambda x: x.cuda(), data)), target.cuda()

            output = model(*data)
            target = target.view(target.size(0), -1)
            loss = criterion(output, target)

            point_error = compute_error(output, target)
            losses.update(loss.item(), data[0].size(0))
            error.update(point_error, data[0].size(0))

            batch_time.update(time.time() - end)
            end = time.time()

            # print the intermediate results
            if batch_idx % args.print_freq == 0:
                logging.info(
                    'Time({}:{:.0f}), Test Epoch [{}]: [{}/{}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.3f} ({loss.avg:.3f})\t'
                    'Error {error.val:.3f} ({error.avg:.3f})'.format(
                        time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())),
                        time.time() % 60, epoch, batch_idx, len(test_loader),
                        batch_time=batch_time, data_time=data_time,
                        loss=losses, error=error))

    writer.add_scalar('Test/Loss', losses.avg, epoch)
    writer.add_scalar('Test/Error', error.avg, epoch)
    logging.info(' * Test Error {error.avg:.3f} Minimal_error {minimal_error:.3f}'.format(
        error=error, minimal_error=minimal_error))
    return error.avg

def train(self, model, epoch, optim_obj='Weights', search_stage=0):
    assert optim_obj in ['Weights', 'Arch']
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    sub_obj_avg = utils.AverageMeter()
    data_time = utils.AverageMeter()
    batch_time = utils.AverageMeter()
    model.train()

    start = time.time()
    if optim_obj == 'Weights':
        prefetcher = data_prefetcher(self.train_data)
    elif optim_obj == 'Arch':
        prefetcher = data_prefetcher(self.val_data)
    input, target = prefetcher.next()
    step = 0
    while input is not None:
        input, target = input.cuda(), target.cuda()
        data_t = time.time() - start
        n = input.size(0)
        if optim_obj == 'Weights':
            self.scheduler.step()
            if step == 0:
                logging.info('epoch %d weight_lr %e', epoch,
                             self.search_optim.weight_optimizer.param_groups[0]['lr'])
            logits, loss, sub_obj = self.search_optim.weight_step(
                input, target, model, search_stage)
        elif optim_obj == 'Arch':
            if step == 0:
                logging.info('epoch %d arch_lr %e', epoch,
                             self.search_optim.arch_optimizer.param_groups[0]['lr'])
            logits, loss, sub_obj = self.search_optim.arch_step(
                input, target, model, search_stage)

        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        del logits, input, target  # free memory before the next prefetch
        batch_t = time.time() - start

        objs.update(loss, n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)
        sub_obj_avg.update(sub_obj)
        data_time.update(data_t)
        batch_time.update(batch_t)

        if step != 0 and step % self.args.report_freq == 0:
            logging.info(
                'Train%s epoch %03d step %03d | loss %.4f %s %.2f top1_acc %.2f '
                'top5_acc %.2f | batch_time %.3f data_time %.3f',
                optim_obj, epoch, step, objs.avg, self.sub_obj_type,
                sub_obj_avg.avg, top1.avg, top5.avg, batch_time.avg, data_time.avg)
        start = time.time()
        step += 1
        input, target = prefetcher.next()

    return top1.avg, top5.avg, objs.avg, sub_obj_avg.avg, batch_time.avg

def train(data_loader, model, criterion, optimizer, epoch, stage, logger, args):
    loss_avg = utils.AverageMeter()
    top1_res = utils.AverageMeter()
    top5_res = utils.AverageMeter()
    global_step = epoch * len(data_loader)
    model.train()
    logger.log("stage: {}".format(stage))

    for step, (images, labels) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        num_samples = images.size(0)

        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, labels)

        prec1_res, prec5_res = utils.accuracy(logits.detach(), labels, topk=(1, 5))
        top1_res.update(prec1_res.item(), num_samples)
        top5_res.update(prec5_res.item(), num_samples)
        loss_avg.update(loss.detach().item(), num_samples)

        loss.backward()
        optimizer.step()

        epochs = args.baseline_epochs
        if step % 100 == 0 or step == len(data_loader) - 1:
            logger.log("Train, Epoch: [{:3d}/{}], Step: [{:3d}/{}], "
                       "Loss: {:.4f}, Prec@(res1, res5): {:.4%}, {:.4%}".format(
                           epoch, epochs, step, len(data_loader),
                           loss_avg.avg, top1_res.avg, top5_res.avg))
        global_step += 1

    logger.log("Train, Epoch: [{:3d}/{}], Step: [{:3d}/{}], "
               "Loss: {:.4f}, Prec@(res1, res5): {:.4%}, {:.4%}".format(
                   epoch, epochs, step, len(data_loader),
                   loss_avg.avg, top1_res.avg, top5_res.avg))

def run_epoch(epoch, dataloader, model, criterion, args, optimizer=None, is_train=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    progress = utils.ProgressMeter(len(dataloader), batch_time, losses,
                                   prefix="Epoch: [{}]".format(epoch))

    time_shift = args.time_shift
    end = time.time()
    for i, data in enumerate(dataloader):
        utt_list, feats, _, _, _, _, _ = data
        mask_size = feats.size(1) - time_shift
        if args.net_type == 'tsfm':
            # causal mask restricted to the non-padded positions
            uni_mask = ((feats[:, :-time_shift, :] != 0)[:, :, 0].unsqueeze(-2).byte()
                        & subsequent_mask(mask_size))

        if args.use_gpu:
            feats = feats.cuda()
            if args.net_type == 'tsfm':
                uni_mask = uni_mask.cuda()

        if args.net_type == 'tsfm':
            outputs = model(feats[:, :-time_shift, :], uni_mask)
        if args.net_type == 'rnn':
            outputs = model(feats[:, :-time_shift, :])

        # predict the frame `time_shift` steps ahead, ignoring padded frames
        mask = feats[:, :-time_shift, :] != 0
        loss = criterion(outputs.masked_select(mask),
                         feats[:, time_shift:, :].masked_select(mask))

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        losses.update(loss.item(), feats.size(0))
        batch_time.update(time.time() - end)
        if i % args.print_freq == 0:
            progress.print(i)
    return losses.avg

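# `utils.ProgressMeter` is referenced above but not shown. A minimal sketch
# matching its usage here (constructed with the batch count plus meters, then
# progress.print(i)); the real helper may format differently.
class ProgressMeter:
    def __init__(self, num_batches, *meters, prefix=""):
        num_digits = len(str(num_batches))
        self.batch_fmtstr = '[{:' + str(num_digits) + 'd}/' + str(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def print(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))
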
def infer(self, model, epoch):
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    sub_obj_avg = utils.AverageMeter()
    data_time = utils.AverageMeter()
    batch_time = utils.AverageMeter()
    model.train()  # don't use running_mean and running_var during search

    start = time.time()
    prefetcher = data_prefetcher(self.val_data)
    input, target = prefetcher.next()
    step = 0
    while input is not None:
        step += 1
        data_t = time.time() - start
        n = input.size(0)
        logits, loss, sub_obj = self.search_optim.valid_step(input, target, model)
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        batch_t = time.time() - start

        objs.update(loss, n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)
        sub_obj_avg.update(sub_obj)
        data_time.update(data_t)
        batch_time.update(batch_t)

        if step % self.args.report_freq == 0:
            logging.info(
                'Val epoch %03d step %03d | loss %.4f %s %.2f top1_acc %.2f '
                'top5_acc %.2f | batch_time %.3f data_time %.3f',
                epoch, step, objs.avg, self.sub_obj_type, sub_obj_avg.avg,
                top1.avg, top5.avg, batch_time.avg, data_time.avg)
        start = time.time()
        input, target = prefetcher.next()

    return top1.avg, top5.avg, objs.avg, sub_obj_avg.avg, batch_time.avg

def train(train_loader, encoder, decoder, criterion, encoder_optim, decoder_optim,
          epoch, opt, num):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    encoder.train()
    decoder.train()

    end = time.time()
    global sim_rec
    global loss_rec
    teaching_ratio = opt.teaching_ratio
    for i, (vfeat, afeat) in enumerate(train_loader):
        bs = vfeat.size(0)
        seq_length = 120

        # reverse vfeat along the temporal axis (the original pairwise in-place
        # swap aliased tensor views and silently corrupted the data)
        vfeat = torch.flip(vfeat, dims=[1])

        # do PCA
        vfeat = pca_tensor(vfeat, pr=False, dim=1024, feat='vfeat')
        afeat = pca_tensor(afeat, pr=False, dim=128, feat='afeat')
        target = afeat

        encoder_optim.zero_grad()
        decoder_optim.zero_grad()
        loss = 0

        # if you have a gpu, then shift data to GPU
        if opt.cuda:
            vfeat = vfeat.cuda()
            target = target.cuda()

        # use video features to generate encoder_output and encoder_hidden
        # (the latter becomes the initial hidden state of the decoder)
        encoder_hidden = encoder.init_hidden(batch_size=bs)
        encoder_output, encoder_hidden = encoder(vfeat, encoder_hidden)

        # decoder
        decoder_hidden = encoder_hidden
        decoder_input = encoder_output[:, 119, :].clone()  # bs * 128
        if opt.cuda:
            decoder_input = decoder_input.cuda()
        decoder_context = torch.mean(encoder_output, dim=1)  # bs * 128

        # scheduled sampling: feed the ground truth with probability teaching_ratio
        teaching = random.random() < teaching_ratio
        if teaching:
            for seq in range(seq_length):
                audio_output, decoder_context, decoder_hidden, attn_weights = decoder(
                    decoder_input, decoder_context, decoder_hidden,
                    encoder_output, seq=seq)
                loss += criterion(audio_output, target[:, seq, :])
                decoder_input = target[:, seq, :]
        else:
            for seq in range(seq_length):
                audio_output, decoder_context, decoder_hidden, attn_weights = decoder(
                    decoder_input, decoder_context, decoder_hidden,
                    encoder_output, seq=seq)
                loss += criterion(audio_output, target[:, seq, :])
                decoder_input = audio_output

        loss = loss / seq_length
        loss_rec.append(loss.item())
        losses.update(loss.item(), vfeat.size(0))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), opt.gradient_clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), opt.gradient_clip)
        encoder_optim.step()
        decoder_optim.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            log_str = ('No.{} Epoch: [{}][{}/{}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                           num, epoch, i, len(train_loader),
                           batch_time=batch_time, loss=losses))
            print(log_str)

def train(train_loader, valid_loader, model, arch, w_optim, alpha_optim, lr, epoch):
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    losses = utils.AverageMeter()

    cur_step = epoch * len(train_loader)
    tb_writer.add_scalar('train/lr', lr, cur_step)

    model.train()
    for step, ((train_X, train_y), (valid_X, valid_y)) in enumerate(
            zip(train_loader, valid_loader)):
        train_X, train_y = (train_X.to(device, non_blocking=True),
                            train_y.to(device, non_blocking=True))
        valid_X, valid_y = (valid_X.to(device, non_blocking=True),
                            valid_y.to(device, non_blocking=True))
        N = train_X.size(0)

        # arch step (alpha training)
        alpha_optim.zero_grad()
        arch.unrolled_backward(train_X, train_y, valid_X, valid_y, lr, w_optim)
        alpha_optim.step()

        # child network step (w)
        w_optim.zero_grad()
        logits = model(train_X)
        loss = model.criterion(logits, train_y)
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(model.weights(), config.w_grad_clip)
        w_optim.step()

        prec1, prec5 = utils.accuracy(logits, train_y, topk=(1, 5))
        losses.update(loss.item(), N)
        top1.update(prec1.item(), N)
        top5.update(prec5.item(), N)

        if step % config.print_freq == 0 or step == len(train_loader) - 1:
            print("\r", end="", flush=True)
            logger.info(
                "Train: [{:2d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
                    epoch + 1, config.epochs, step, len(train_loader) - 1,
                    losses=losses, top1=top1, top5=top5))
        else:
            print(
                "\rTrain: [{:2d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
                    epoch + 1, config.epochs, step, len(train_loader) - 1,
                    losses=losses, top1=top1, top5=top5),
                end="", flush=True)

        tb_writer.add_scalar('train/loss', loss.item(), cur_step)
        tb_writer.add_scalar('train/top1', prec1.item(), cur_step)
        tb_writer.add_scalar('train/top5', prec5.item(), cur_step)

        if step % (config.print_freq // 5) == 0 or step == len(train_loader) - 1:
            # not too much logging
            for i, tensor in enumerate(model.alpha_normal):
                for j, lsn in enumerate(F.softmax(tensor, dim=-1)):
                    tb_writer.add_scalars(
                        'alpha_normal/%d ~~ %d' % ((j - 2), i), {
                            'max_pl3': lsn[0], 'avg_pl3': lsn[1],
                            'skip_cn': lsn[2], 'sep_conv3': lsn[3],
                            'sep_conv5': lsn[4], 'dil_conv3': lsn[5],
                            'dil_conv5': lsn[6], 'none': lsn[7]
                        }, cur_step)
            for i, tensor in enumerate(model.alpha_reduce):
                for j, lsr in enumerate(F.softmax(tensor, dim=-1)):
                    tb_writer.add_scalars(
                        'alpha_reduce/%d ~~ %d' % ((j - 2), i), {
                            'max_pl3': lsr[0], 'avg_pl3': lsr[1],
                            'skip_cn': lsr[2], 'sep_conv3': lsr[3],
                            'sep_conv5': lsr[4], 'dil_conv3': lsr[5],
                            'dil_conv5': lsr[6], 'none': lsr[7]
                        }, cur_step)
        cur_step += 1

    logger.info("Train: [{:2d}/{}] Final Prec@1 {:.4%}".format(
        epoch + 1, config.epochs, top1.avg))

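# `utils.accuracy` computes top-k precision throughout this file. A standard
# implementation consistent with the topk=(1, 5) call sites is sketched below;
# note the scale is ambiguous across these snippets -- the `{:.1%}` format
# strings expect fractions (scale 1/batch_size), while the `%.2f` loggers
# expect percentages (scale 100/batch_size). This sketch returns fractions.
import torch

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(1.0 / batch_size))
        return res
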
def run_epoch(epoch, dataloader, model, criterion_ctc, criterion_phrase, args,
              optimizer=None, is_train=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    ctc_losses = utils.AverageMeter('CtcLoss', ":.4e")
    phrase_losses = utils.AverageMeter('PIDLoss', ":.4e")
    phone_wers = utils.AverageMeter('Phone_WER', ':.4f')
    pid_accs = utils.AverageMeter('PID_Acc', ':.4f')
    progress = utils.ProgressMeter(len(dataloader), batch_time, losses, ctc_losses,
                                   phone_wers, phrase_losses, pid_accs,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    for i, data in enumerate(dataloader):
        utt_list, feats, labels, feat_sizes, label_sizes, phrase_label, _ = data
        batch_size, mask_size, _ = feats.size()
        # the model downsamples time by a factor of 2; integer division keeps
        # feat_sizes an integer tensor (in-place `/= 2` fails on int tensors
        # in recent PyTorch)
        feat_sizes = feat_sizes // 2
        # uni_mask = (feats != 0)[:, :, 0].unsqueeze(-2).byte() & subsequent_mask(mask_size)

        if args.use_gpu:
            feats = feats.cuda()
            # uni_mask = uni_mask.cuda()
            labels = labels.cuda()
            feat_sizes = feat_sizes.cuda()
            label_sizes = label_sizes.cuda()
            phrase_label = phrase_label.cuda()

        ctc_out, phrase_out, _ = model(feats, feat_sizes)
        ctc_loss = criterion_ctc(ctc_out.transpose(0, 1), labels,
                                 feat_sizes, label_sizes)
        phrase_loss = criterion_phrase(phrase_out, phrase_label.view(-1))
        loss = ctc_loss + args.phrase_lambda * phrase_loss

        batch_errs, batch_tokens = model.compute_wer(
            torch.max(ctc_out, dim=-1)[1].cpu().numpy(), feat_sizes.cpu().numpy(),
            labels.cpu().numpy(), label_sizes.cpu().numpy())
        phone_wers.update(batch_errs / batch_tokens, batch_tokens)

        correct = torch.sum(
            torch.argmax(phrase_out, dim=-1).view(-1) == phrase_label.view(-1)).item()
        batch_num = torch.sum(phrase_label != -1).item() + 1e-7  # avoid division by zero
        pid_accs.update(correct / batch_num, batch_num)

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        losses.update(loss.item(), batch_size)
        ctc_losses.update(ctc_loss.item(), batch_size)
        phrase_losses.update(phrase_loss.item(), batch_size)
        batch_time.update(time.time() - end)
        if i % args.print_freq == 0:
            progress.print(i)
    return losses.avg, phone_wers.avg

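# `subsequent_mask`, used for the causal masks in the run_epoch functions above
# (actively in the first, in a commented-out line in the second), is typically
# the standard upper-triangular mask from the Annotated Transformer; this
# sketch is an assumption, not taken from this repository.
import torch

def subsequent_mask(size):
    """Mask out positions after the current one (nonzero = attention allowed)."""
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape, dtype=torch.uint8), diagonal=1)
    # return uint8 to match the `.byte() & ...` combination at the call site
    return (mask == 0).byte()
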
def train(train_loader, train_mpii_loader, model, criterion, optimizer, epoch):
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    mpii_losses = utils.AverageMeter()
    point_errors = utils.AverageMeter()
    angle_errors = utils.AverageMeter()

    model.train()
    end = time.time()

    # TODO: add MPIIGaze dataset iteration
    train_mpii_iterator = iter(train_mpii_loader)
    for batch_idx, input in enumerate(train_loader):
        data_time.update(time.time() - end)
        data, target = tuple(input[:len(input) - 1]), input[-1]
        if args.cuda:
            data, target = tuple(map(lambda x: x.cuda(), data)), target.cuda()

        output = model(*data)
        target = target.view(target.size(0), -1)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Measure point error and record loss
        point_error = compute_error(output, target)
        losses.update(loss.item(), data[0].size(0))
        point_errors.update(point_error, data[0].size(0))

        batch_time.update(time.time() - end)
        end = time.time()

        writer.add_scalar('Train/Loss', losses.avg,
                          batch_idx + len(train_loader) * epoch)
        writer.add_scalar('Train/Error', point_errors.avg,
                          batch_idx + len(train_loader) * epoch)

        # print the intermediate results
        if batch_idx % args.print_freq == 0:
            logging.info('Time({}:{:.0f}), Train Epoch [{}]: [{}/{}]\t'
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                         'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                         'Loss {loss.val:.3f} ({loss.avg:.3f})\t'
                         'Error {error.val:.3f} ({error.avg:.3f})'.format(
                             time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())),
                             time.time() % 60, epoch, batch_idx, len(train_loader),
                             batch_time=batch_time, data_time=data_time,
                             loss=losses, error=point_errors))

        # TODO: MPIIGaze dataset iteration
        (mpii_input, mpii_target), train_mpii_iterator = utils.infinite_get(
            train_mpii_iterator, train_mpii_loader)
        if args.cuda:
            mpii_input, mpii_target = (tuple(map(lambda x: x.cuda(), mpii_input)),
                                       mpii_target.cuda())
        mpii_output = model(*mpii_input)
        mpii_target = mpii_target.view(mpii_target.size(0), -1)

        # TODO: add coplanar loss -- note that `distance` is not defined in this
        # snippet; it must be computed before this line can run
        mpii_loss = (criterion(mpii_output, mpii_target)
                     + 0.000001 * criterion(distance, mpii_target[2]))

        optimizer.zero_grad()
        mpii_loss.backward()
        optimizer.step()

        # Measure angle error and record loss
        angle_error = compute_angle_error(mpii_output, mpii_target)
        mpii_losses.update(mpii_loss.item(), mpii_input[0].size(0))
        angle_errors.update(angle_error, mpii_input[0].size(0))

        writer.add_scalar('Train/MPII_Loss', mpii_losses.avg,
                          batch_idx + len(train_loader) * epoch)
        writer.add_scalar('Train/Angle_Error', angle_errors.avg,
                          batch_idx + len(train_loader) * epoch)

        # print the intermediate results
        if batch_idx % args.print_freq == 0:
            logging.info('Time({}:{:.0f}), Train Epoch [{}]: [{}/{}]\t'
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                         'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                         'Loss {loss.val:.3f} ({loss.avg:.3f})\t'
                         'Error {error.val:.3f} ({error.avg:.3f})'.format(
                             time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())),
                             time.time() % 60, epoch, batch_idx, len(train_loader),
                             batch_time=batch_time, data_time=data_time,
                             loss=mpii_losses, error=angle_errors))

def train(train_loader, model, criterion, optimizer, epoch, opt):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    model.train()

    right = 0
    total = 0
    end = time.time()
    for i, (vfeat, afeat) in enumerate(train_loader):
        # shuffling the index orders; reshuffle once if we hit the identity
        bz = vfeat.size(0)
        orders = np.arange(bz).astype('int32')
        shuffle_orders = orders.copy()
        np.random.shuffle(shuffle_orders)
        if (shuffle_orders == orders).all():
            np.random.shuffle(shuffle_orders)
        label1 = (orders == shuffle_orders + 0).astype('float32')
        shuffle_orders = torch.from_numpy(shuffle_orders).long()

        # more negative data: additionally permute the temporal order
        augment_order = torch.from_numpy(np.random.permutation(11)).long()
        # augment_order = torch.from_numpy(np.random.permutation(120)).long()
        vfeat3_source = vfeat[shuffle_orders]
        vfeat3 = vfeat3_source[:, augment_order, :]
        afeat3_source = afeat[shuffle_orders]
        afeat3 = afeat3_source[:, augment_order, :]

        # creating new data with the shuffled indices
        afeat2 = afeat[shuffle_orders].clone()

        # concat the vfeat and afeat respectively
        afeat0 = torch.cat((afeat, afeat2, afeat3), 0)
        vfeat0 = torch.cat((vfeat, vfeat, vfeat3), 0)

        # generating the labels:
        # 1. the labels for the shuffled feats
        target1 = torch.from_numpy(label1)
        # 2. the labels for the original feats
        label2 = label1.copy()
        label2[:] = 1
        target2 = torch.from_numpy(label2)
        # concat the labels together
        target = torch.cat((target2, target1, torch.zeros(target1.size(0))), 0)
        target = 1 - target

        # transpose the feats
        vfeat0 = vfeat0.transpose(2, 1)
        afeat0 = afeat0.transpose(2, 1)

        vfeat_var = vfeat0
        afeat_var = afeat0
        target_var = target

        # if you have a gpu, then shift data to GPU
        if opt.cuda:
            vfeat_var = vfeat_var.cuda()
            afeat_var = afeat_var.cuda()
            target_var = target_var.cuda()

        # forward, backward, optimize
        sim = model(vfeat_var, afeat_var)  # inferred similarity
        loss = criterion(sim, target_var)  # compute contrastive loss
        res = (sim > 0.5).float()
        right += torch.sum(res == target_var).item()

        # update loss in the loss meter
        losses.update(loss.item(), vfeat0.size(0))

        # compute gradient and do sgd
        optimizer.zero_grad()
        loss.backward()
        # gradient clip stuff
        # utils.clip_gradient(optimizer, opt.gradient_clip)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            log_str = ('Epoch: [{0}][{1}/{2}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                           epoch, i, len(train_loader),
                           batch_time=batch_time, loss=losses))
            print(log_str)
            # 1270 appears to be a hard-coded dataset size
            print(right / 1270 / 3, 1270 * 3 - right)

def main():
    if not torch.cuda.is_available():
        print('no gpu device available')
        sys.exit(1)

    if args.random_seed:
        args.seed = np.random.randint(0, 1000)

    # reproducible: re-running the code produces the same initialization.
    # You should ensure that all other libraries your code relies on which
    # use random numbers also use a fixed seed.
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = torch.nn.BCEWithLogitsLoss().cuda()
    # in_channels is the feature dimension
    model = Network(args.init_channels, args.classes, args.num_cells, criterion,
                    args.n_steps, in_channels=args.in_channels).cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    num_edges = model._steps * 2
    post_train = 5
    args.epochs = (args.warmup_dec_epoch + args.decision_freq * (num_edges - 1)
                   + post_train + 1)
    logging.info("total epochs: %d", args.epochs)

    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

    normal_selected_idxs = torch.tensor(len(model.alphas_normal) * [-1],
                                        requires_grad=False, dtype=torch.int).cuda()
    normal_candidate_flags = torch.tensor(len(model.alphas_normal) * [True],
                                          requires_grad=False, dtype=torch.bool).cuda()
    logging.info('normal_selected_idxs: {}'.format(normal_selected_idxs))
    logging.info('normal_candidate_flags: {}'.format(normal_candidate_flags))
    model.normal_selected_idxs = normal_selected_idxs
    model.normal_candidate_flags = normal_candidate_flags

    print(F.softmax(torch.stack(model.alphas_normal, dim=0), dim=-1).detach())

    normal_probs_history = []
    train_losses, valid_losses = utils.AverageMeter(), utils.AverageMeter()
    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # training
        train_acc, train_loss = train(model, architect, criterion, optimizer, lr)
        logging.info('train_loss %f', train_loss)

        valid_acc, valid_losses = infer(model, criterion, valid_losses)
        logging.info('train_acc %f\tvalid_acc %f', train_acc, valid_acc)

        # make edge decisions
        saved_memory_normal, model.normal_selected_idxs, \
            model.normal_candidate_flags = edge_decision('normal',
                                                         model.alphas_normal,
                                                         model.normal_selected_idxs,
                                                         model.normal_candidate_flags,
                                                         normal_probs_history,
                                                         epoch, model, args)

        writer.add_scalar('stats/train_acc', train_acc, epoch)
        writer.add_scalar('stats/valid_acc', valid_acc, epoch)
        utils.save(model, os.path.join(args.save, 'search_weights.pt'))
        scheduler.step()

    logging.info("#" * 30 + " Done " + "#" * 30)
    logging.info('genotype = %s', model.get_genotype())

def train(train_loader, model, criterion, optimizer, epoch, opt, eponum):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    model.train()
    model.float()

    loss_rec = []
    end = time.time()
    for i, (data, label) in enumerate(train_loader):
        # shuffle the batch order
        bz = label.size(0)
        orders = np.arange(bz).astype('int32')
        shuffle_orders = orders.copy()
        np.random.shuffle(shuffle_orders)

        # create new data with the shuffled indices
        data = data[torch.from_numpy(shuffle_orders).long()].clone()
        label = label[torch.from_numpy(shuffle_orders).long()].clone()

        # class indices for the criterion
        label = label.view(label.size(0))
        data_var = data
        target_var = label.long()

        # if you have a gpu, then shift data to GPU
        if opt.cuda:
            data_var = data_var.cuda()
            target_var = target_var.cuda()

        # forward, backward, optimize
        sim = model(data_var)
        loss = criterion(sim, target_var)
        loss_rec.append(loss.item())

        # update loss in the loss meter
        losses.update(loss.item(), label.size(0))

        # compute gradient and do sgd
        optimizer.zero_grad()
        loss.backward()
        # gradient clip stuff
        # utils.clip_gradient(optimizer, opt.gradient_clip)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            log_str = ('Fold:[{3}] Epoch: [{0}][{1}/{2}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                           eponum, i, len(train_loader), epoch,
                           batch_time=batch_time, loss=losses))
            print(log_str)

def train(data_loader, model, criterion, optimizer_t, optimizer_s, epoch, stage,
          logger, args):
    [loss_avg, mse_avg, top1_cnn, top5_cnn, top1_res, top5_res] = \
        [utils.AverageMeter() for _ in range(6)]
    global_step = epoch * len(data_loader)
    model.train()
    logger.log("stage: {}".format(stage))
    m = Cosine(min_v=args.dc, max_v=1.0, epoch=epoch, epoch_max=60)
    # m = 1.0
    model.module.reset_margin()

    for step, (images, labels) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        num_samples = images.size(0)
        if optimizer_t is not None:
            optimizer_t.zero_grad()
        if optimizer_s is not None:
            optimizer_s.zero_grad()

        if "TA" in stage:
            # teacher forward (kept in eval mode)
            model.module.teacher.eval()
            logits_teacher, teacher_feas = model(images, stage='RES_TA', epoch=epoch)

            logits_student, _, loss_dis = model(images, stage=stage, epoch=epoch,
                                                teacher_feas=teacher_feas[-1])
            loss = 0
            loss_last = criterion(logits_student[-1], labels)
            loss_avg.update(loss_last.detach().item(), num_samples)
            loss += loss_last
            # dis_weight: 10^-3 for 32x32 images, 10^-4 for 224x224-scale
            # classification, 10^-5 for detection and segmentation
            loss += loss_dis[-1].mean() * args.dis_weight

            if isinstance(logits_student, list):
                prec1_cnn, prec5_cnn = utils.accuracy(logits_student[-1].detach(),
                                                      labels, topk=(1, 5))
            else:
                prec1_cnn, prec5_cnn = utils.accuracy(logits_student.detach(),
                                                      labels, topk=(1, 5))
            prec1_res, prec5_res = utils.accuracy(logits_teacher.detach(), labels,
                                                  topk=(1, 5))

            # the teacher is only updated by its own loss
            loss.backward()
            optimizer_s.step()
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)

        elif "KD" in stage:
            model.module.teacher.eval()
            logits_teacher, teacher_feas = model(images, stage='RES_TA', epoch=epoch)

            logits_student, _, loss_dis = model(images, stage=stage, epoch=epoch,
                                                teacher_feas=teacher_feas[-1])
            loss = 0
            loss += criterion(logits_student[-1], labels)
            loss_avg.update(loss.detach().item(), num_samples)
            if loss_dis is not None:
                for loss_d in loss_dis[:-1]:
                    loss += loss_d.mean() * m * args.dis_weight
                mse_avg.update(loss_dis[-1].detach().mean().item(), num_samples)
                loss += loss_dis[-1].mean() * args.dis_weight

            if isinstance(logits_student, list):
                prec1_cnn, prec5_cnn = utils.accuracy(logits_student[-1].detach(),
                                                      labels, topk=(1, 5))
            else:
                prec1_cnn, prec5_cnn = utils.accuracy(logits_student.detach(),
                                                      labels, topk=(1, 5))
            prec1_res, prec5_res = utils.accuracy(logits_teacher.detach(), labels,
                                                  topk=(1, 5))

            loss.backward()
            optimizer_s.step()
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)

        elif "KL" in stage:
            model.module.teacher.eval()
            logits_teacher = model(images, stage='RES_NMT', epoch=epoch)
            logits_student = model(images, stage="CNN_NMT", epoch=epoch)
            loss = loss_KD_fn(criterion, logits_student, logits_teacher,
                              targets=labels, alpha=args.alpha,
                              temperature=args.temperature)

            prec1_cnn, prec5_cnn = utils.accuracy(logits_student.detach(), labels,
                                                  topk=(1, 5))
            prec1_res, prec5_res = utils.accuracy(logits_teacher.detach(), labels,
                                                  topk=(1, 5))

            loss.backward()
            optimizer_s.step()
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)

        elif "JOINT" in stage:
            # teacher and student are jointly trained from scratch
            model.module.teacher.train()
            optimizer_t.zero_grad()
            logits_teacher, teacher_feas = model(images, stage='RES_TA', epoch=epoch)
            loss_teacher = criterion(logits_teacher, labels)
            loss_teacher.backward()
            optimizer_t.step()
            model.module.teacher.eval()

            logits_student, _, loss_dis = model(images, stage=stage, epoch=epoch,
                                                teacher_feas=teacher_feas[-1])
            loss = 0
            xishu = 1.0 / 4.  # per-head loss coefficient
            for logit_student in logits_student[:-1]:
                KD_TRAIN = False
                if KD_TRAIN:
                    loss += loss_KD_fn(criterion, logit_student, logits_teacher,
                                       targets=labels, alpha=args.alpha,
                                       temperature=args.temperature) * m * xishu
                else:
                    loss += criterion(logit_student, labels) * m * xishu
            loss_last = criterion(logits_student[-1], labels) * xishu
            loss_avg.update(loss_last.detach().item(), num_samples)
            loss += loss_last
            if loss_dis is not None:
                for loss_d in loss_dis[:-1]:
                    loss += loss_d.mean() * m * xishu * args.dis_weight
                mse_avg.update(loss_dis[-1].detach().mean().item(), num_samples)
                loss += loss_dis[-1].mean() * args.dis_weight * xishu

            if isinstance(logits_student, list):
                prec1_cnn, prec5_cnn = utils.accuracy(logits_student[-1].detach(),
                                                      labels, topk=(1, 5))
            else:
                prec1_cnn, prec5_cnn = utils.accuracy(logits_student.detach(),
                                                      labels, topk=(1, 5))
            prec1_res, prec5_res = utils.accuracy(logits_teacher.detach(), labels,
                                                  topk=(1, 5))

            # the teacher was already updated by its own loss above
            loss.backward()
            optimizer_s.step()
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)

        elif "RES_NMT" in stage:
            logits = model(images, stage='RES_NMT')
            loss = criterion(logits, labels)
            prec1_res, prec5_res = utils.accuracy(logits.detach(), labels, topk=(1, 5))
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)
            loss_avg.update(loss.detach().item(), num_samples)
            loss.backward()
            optimizer_t.step()

        elif "CNN_NMT" in stage:
            logits = model(images, stage=stage)
            loss = criterion(logits, labels)
            prec1_cnn, prec5_cnn = utils.accuracy(logits.detach(), labels, topk=(1, 5))
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)
            loss_avg.update(loss.detach().item(), num_samples)
            loss.backward()
            optimizer_s.step()

        elif "RES_KD" in stage:
            logit_student, logits_teacher = model(images, stage=stage)
            loss = loss_KD_fn(criterion, logit_student, logits_teacher,
                              targets=labels, alpha=args.alpha,
                              temperature=args.temperature)
            prec1_res, prec5_res = utils.accuracy(logit_student.detach(), labels,
                                                  topk=(1, 5))
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)
            loss_avg.update(loss.detach().item(), num_samples)

        else:
            raise NameError("invalid stage name")

        epochs = args.baseline_epochs
        if step % 100 == 0 or step == len(data_loader) - 1:
            logger.log("Train, Epoch: [{:3d}/{}], Step: [{:3d}/{}], "
                       "Loss: {:.4f}, Loss_dis: {:.4f}, "
                       "Prec@(cnn1, res1, cnn5, res5): {:.4%}, {:.4%}, {:.4%}, {:.4%}".format(
                           epoch, epochs, step, len(data_loader), loss_avg.avg,
                           mse_avg.avg, top1_cnn.avg, top1_res.avg,
                           top5_cnn.avg, top5_res.avg))
        global_step += 1

    logger.log("m is {}".format(m))
    logger.log("Train, Epoch: [{:3d}/{}], Final Prec: cnn, res@1: {:.4%}, {:.4%}, "
               "Final Prec: cnn, res@5: {:.4%}, {:.4%} Loss: {:.4f}".format(
                   epoch, epochs, top1_cnn.avg, top1_res.avg,
                   top5_cnn.avg, top5_res.avg, loss_avg.avg))

def valid(data_loader, model, criterion, epoch, global_step, stage, logger, args):
    loss_avg = utils.AverageMeter()
    top1_cnn = utils.AverageMeter()
    top5_cnn = utils.AverageMeter()
    top1_res = utils.AverageMeter()
    top5_res = utils.AverageMeter()
    global_step = epoch * len(data_loader)
    model.eval()
    logger.log("stage: {}".format(stage))

    with torch.no_grad():
        for step, (images, labels) in enumerate(data_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            num_samples = images.size(0)

            if ("TA" in stage or "JOINT" in stage or "KD" in stage
                    or "KL" in stage):
                logits = model(images, stage='CNN_NMT')
                logits_teacher = model(images, stage='RES_NMT')
                prec1_cnn, prec5_cnn = utils.accuracy(logits.detach(), labels,
                                                      topk=(1, 5))
                prec1_res, prec5_res = utils.accuracy(logits_teacher.detach(),
                                                      labels, topk=(1, 5))
                loss = criterion(logits, labels)
                loss_avg.update(loss.detach().item(), num_samples)
                top1_cnn.update(prec1_cnn.item(), num_samples)
                top5_cnn.update(prec5_cnn.item(), num_samples)
                top1_res.update(prec1_res.item(), num_samples)
                top5_res.update(prec5_res.item(), num_samples)
            elif "RES_NMT" in stage:
                logits = model(images, stage=stage)
                loss = criterion(logits, labels)
                prec1_res, prec5_res = utils.accuracy(logits, labels, topk=(1, 5))
                top1_res.update(prec1_res.item(), num_samples)
                top5_res.update(prec5_res.item(), num_samples)
                loss_avg.update(loss.item(), num_samples)
            elif "CNN_NMT" in stage:
                logits = model(images, stage=stage)
                loss = criterion(logits, labels)
                prec1_cnn, prec5_cnn = utils.accuracy(logits, labels, topk=(1, 5))
                top1_cnn.update(prec1_cnn.item(), num_samples)
                top5_cnn.update(prec5_cnn.item(), num_samples)
                loss_avg.update(loss.item(), num_samples)
            elif "RES_KD" in stage:
                logit_student, logits_teacher = model(images, stage=stage)
                loss = loss_KD_fn(criterion, logit_student, logits_teacher,
                                  targets=labels, alpha=args.alpha,
                                  temperature=args.temperature)
                prec1_res, prec5_res = utils.accuracy(logit_student.detach(),
                                                      labels, topk=(1, 5))
                top1_res.update(prec1_res.item(), num_samples)
                top5_res.update(prec5_res.item(), num_samples)
                loss_avg.update(loss.detach().item(), num_samples)
            else:
                raise NameError("invalid stage name")

            epochs = args.baseline_epochs
            if step % 100 == 0 or step == len(data_loader) - 1:
                logger.log("Valid, Epoch: [{:3d}/{}], Step: [{:3d}/{}], "
                           "Loss: {:.4f}, Prec@(cnn1, res1, cnn5, res5): "
                           "{:.4%}, {:.4%}, {:.4%}, {:.4%}".format(
                               epoch, epochs, step, len(data_loader), loss_avg.avg,
                               top1_cnn.avg, top1_res.avg,
                               top5_cnn.avg, top5_res.avg))
            global_step += 1

    logger.log("Valid, Epoch: [{:3d}/{}], Final Prec: cnn, res@1: {:.4%}, {:.4%}, "
               "Final Prec: cnn, res@5: {:.4%}, {:.4%} Loss: {:.4f}".format(
                   epoch, epochs, top1_cnn.avg, top1_res.avg,
                   top5_cnn.avg, top5_res.avg, loss_avg.avg))

    if "RES" in stage:
        return top1_res.avg
    return top1_cnn.avg

def train(train_loader, model, criterion, optimizer, epoch, opt, logger=None):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    model.train()

    end = time.time()
    for i, (vfeat, afeat) in enumerate(train_loader):
        # shuffling the index orders
        bz = vfeat.size(0)
        orders = np.arange(bz).astype('int32')
        shuffle_orders = orders.copy()
        np.random.shuffle(shuffle_orders)

        # creating new data with the shuffled indices
        afeat2 = afeat[torch.from_numpy(shuffle_orders).long()].clone()

        # concat the vfeat and afeat respectively
        afeat0 = torch.cat((afeat, afeat2), 0)
        vfeat0 = torch.cat((vfeat, vfeat), 0)

        # generating the labels:
        # 1. the labels for the shuffled feats
        label1 = (orders == shuffle_orders + 0).astype('float32')
        target1 = torch.from_numpy(label1)
        # 2. the labels for the original feats
        label2 = label1.copy()
        label2[:] = 1
        target2 = torch.from_numpy(label2)
        # concat the labels together
        target = torch.cat((target2, target1), 0)
        target = 1 - target

        vfeat_var = vfeat0.cuda()
        afeat_var = afeat0.cuda()
        target_var = target.cuda()

        # forward, backward, optimize
        sim = model(vfeat_var, afeat_var)  # inferred similarity
        loss = criterion(sim, target_var)

        # update loss in the loss meter
        losses.update(loss.item(), vfeat0.size(0))

        # compute gradient and do sgd
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # logger=None means no logger
        if logger:
            logger.add_scalar('loss', loss.item(), epoch)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            print(f'Epoch: [{epoch}][{i}/{len(train_loader)}]\t'
                  f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  f'Loss {losses.val:.4f} ({losses.avg:.4f})')
    return losses.avg

def train(train_loader, model, criterion, optimizer, epoch, opt):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    model.train()

    end = time.time()
    for i, (vfeat, afeat) in enumerate(train_loader):
        # shuffling the index orders
        bz = vfeat.size(0)
        orders = np.arange(bz)
        shuffle_orders = orders.copy()
        np.random.shuffle(shuffle_orders)

        # creating new data with the shuffled indices
        afeat2 = afeat[torch.from_numpy(shuffle_orders)].clone()

        # concat the vfeat and afeat respectively
        afeat0 = torch.cat((afeat, afeat2), 0)
        vfeat0 = torch.cat((vfeat, vfeat), 0)

        # generating the labels:
        # 1. the labels for the shuffled feats
        label1 = (orders == shuffle_orders + 0).astype('float32')
        target1 = torch.from_numpy(label1)
        # 2. the labels for the original feats
        label2 = label1.copy()
        label2[:] = 1
        target2 = torch.from_numpy(label2)
        # concat the labels together
        target = torch.cat((target2, target1), 0)
        target = 1 - target

        vfeat_var = vfeat0
        afeat_var = afeat0
        target_var = target

        # if you have a gpu, then shift data to GPU
        if opt.cuda:
            vfeat_var = vfeat_var.cuda()
            afeat_var = afeat_var.cuda()
            target_var = target_var.cuda()

        # forward, backward, optimize
        sim = model(vfeat_var, afeat_var)  # inferred similarity
        loss = criterion(sim, target_var)  # compute contrastive loss

        # update loss in the loss meter
        losses.update(loss.item(), vfeat0.size(0))

        # compute gradient and do sgd
        optimizer.zero_grad()
        loss.backward()
        # gradient clip stuff
        # utils.clip_gradient(optimizer, opt.gradient_clip)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            log_str = ('Epoch: [{0}][{1}/{2}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                           epoch, i, len(train_loader),
                           batch_time=batch_time, loss=losses))
            mylog.info(log_str)

def train(train_loader, model, criterion, optimizer, epoch, opt):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    model.train()

    end = time.time()
    global dis1_rec
    global dis2_rec
    global loss_rec
    for i, (vfeat, afeat) in enumerate(train_loader):
        # shuffling the index orders
        bz = vfeat.size(0)
        orders = np.arange(bz).astype('int32')
        shuffle_orders = orders.copy()
        np.random.shuffle(shuffle_orders)

        # creating new data with the shuffled indices
        afeat2 = afeat[torch.from_numpy(shuffle_orders).long()].clone()

        # generating the labels:
        # 1. the labels for the shuffled feats
        label1 = (orders == shuffle_orders + 0).astype('float32')
        target1 = torch.from_numpy(label1)
        # 2. the labels for the original feats
        label2 = label1.copy()
        label2[:] = 1
        target2 = torch.from_numpy(label2)

        # randomly put the matched half first or second
        if np.random.randint(0, 100) % 2 == 0:
            afeat0 = torch.cat((afeat, afeat2), 0)
            vfeat0 = torch.cat((vfeat, vfeat), 0)
            target = torch.cat((target2, target1), 0)
        else:
            afeat0 = torch.cat((afeat2, afeat), 0)
            vfeat0 = torch.cat((vfeat, vfeat), 0)
            target = torch.cat((target1, target2), 0)
        target = 1 - target

        # class indices for the cross-entropy criterion
        vfeat_var = vfeat0
        afeat_var = afeat0
        target_var = target.long().view(-1)

        # if you have a gpu, then shift data to GPU
        if opt.cuda:
            vfeat_var = vfeat_var.cuda()
            afeat_var = afeat_var.cuda()
            target_var = target_var.cuda()

        # forward, backward, optimize
        sim = model(vfeat_var, afeat_var, train_status=True)  # inferred similarity
        loss = criterion(sim, target_var)

        # record the loss to plot later
        loss_rec.append(loss.item())

        # update loss in the loss meter
        losses.update(loss.item(), vfeat0.size(0))

        # compute gradient and do sgd
        optimizer.zero_grad()
        loss.backward()
        # gradient clip stuff
        # utils.clip_gradient(optimizer, opt.gradient_clip)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            log_str = ('Epoch: [{0}][{1}/{2}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                           epoch, i, len(train_loader),
                           batch_time=batch_time, loss=losses))
            print(log_str)

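# A tiny worked example of the negative-pair label construction shared by the
# audio-visual training loops above (values chosen here for illustration):
# shuffle the batch order, pair each video with a (usually) mismatched audio,
# then flip the labels so matched pairs get 0 and mismatched pairs get 1.
import numpy as np

orders = np.arange(4)                                  # [0 1 2 3]
shuffle_orders = np.array([2, 0, 3, 1])                # a sample permutation
label1 = (orders == shuffle_orders).astype('float32')  # [0. 0. 0. 0.] -> all mismatches
label2 = np.ones_like(label1)                          # originals are matches
target = 1 - np.concatenate([label2, label1])          # [0 0 0 0 1 1 1 1]
# i.e. a distance-style target: 0 for the matched half, 1 for the shuffled half
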
def train(data_loader, model, criterion, optimizer_t, optimizer_m, epoch, stage,
          logger, args, epoch_dict):
    loss_avg = utils.AverageMeter()
    mse_avg = utils.AverageMeter()
    top1_cnn = utils.AverageMeter()
    top5_cnn = utils.AverageMeter()
    top1_res = utils.AverageMeter()
    top5_res = utils.AverageMeter()
    global_step = epoch * len(data_loader)

    model.train()
    if "TA" in stage:
        model.module.teacher.eval()
    else:
        model.module.teacher.train()
    logger.log("stage: {}".format(stage))
    m = Cosine(min_v=0.5, max_v=1., epoch=epoch)
    mask_log = []  # only filled in the RES_NMT branch, but logged after the loop

    for step, (images, labels) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        num_samples = images.size(0)

        optimizer_t.zero_grad()
        optimizer_m.zero_grad()

        if "TA" in stage:
            logits, logits_teacher, loss_dis = model(x=images, stage=stage,
                                                     epoch=epoch,
                                                     batch_pro=args.batch_pro,
                                                     windowsize=args.windowsize)
            if stage == "TA1":
                loss = loss_KD_fn(criterion, logits, logits_teacher,
                                  targets=labels, alpha=args.alpha,
                                  temperature=args.temperature)
            elif stage == "TA2":
                loss = 0.
                for logit_student in logits[:-1]:
                    loss += criterion(logit_student, labels) * m
                loss += criterion(logits[-1], labels)
            loss_avg.update(loss.detach().item(), num_samples)

            if loss_dis is not None:
                for loss_d in loss_dis[:-1]:
                    loss += loss_d.mean() * m * 0.25 * args.dis_weight
                mse_avg.update(loss_dis[-1].detach().mean().item(), num_samples)
                # dis_weight: 10^-3 for 32x32 images, 10^-4 for 224x224-scale
                # classification, 10^-5 for detection and segmentation
                loss += loss_dis[-1].mean() * args.dis_weight

            if isinstance(logits, list):
                prec1_cnn, prec5_cnn = utils.accuracy(logits[-1].detach(), labels,
                                                      topk=(1, 5))
            else:
                prec1_cnn, prec5_cnn = utils.accuracy(logits.detach(), labels,
                                                      topk=(1, 5))
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)

        elif "RES_NMT" in stage:
            logits = model(images, stage=stage)
            loss = criterion(logits, labels)

            # train the mask with an L1 sparsity penalty
            mask = []
            mask_log = []
            for name, param in model.named_parameters():
                if 'mask' in name and "teacher" in name:
                    mask.append(param.view(-1))
                    mask_log.append(param.detach())
            mask = torch.cat(mask)
            error_sparse = args.sparse_lambda * torch.norm(mask, 1)
            error_sparse.backward()

            prec1_res, prec5_res = utils.accuracy(logits.detach(), labels, topk=(1, 5))
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)
            loss_avg.update(loss.detach().item(), num_samples)

        elif "CNN_NMT" in stage:
            logits = model(images, stage=stage)
            loss = criterion(logits, labels)
            prec1_cnn, prec5_cnn = utils.accuracy(logits.detach(), labels, topk=(1, 5))
            top1_cnn.update(prec1_cnn.item(), num_samples)
            top5_cnn.update(prec5_cnn.item(), num_samples)
            loss_avg.update(loss.detach().item(), num_samples)

        elif "RES_KD" in stage:
            logit_student, logits_teacher = model(images, stage=stage)
            loss = loss_KD_fn(criterion, logit_student, logits_teacher,
                              targets=labels, alpha=args.alpha,
                              temperature=args.temperature)
            prec1_res, prec5_res = utils.accuracy(logit_student.detach(), labels,
                                                  topk=(1, 5))
            top1_res.update(prec1_res.item(), num_samples)
            top5_res.update(prec5_res.item(), num_samples)
            loss_avg.update(loss.detach().item(), num_samples)

        else:
            raise NameError("invalid stage name")

        loss.backward()
        optimizer_t.step()
        if epoch >= 1:
            optimizer_m.step()

        epochs = epoch_dict[stage]
        if step % 100 == 0 or step == len(data_loader) - 1:
            logger.log("Train, Epoch: [{:3d}/{}], Step: [{:3d}/{}], "
                       "Loss: {:.4f}, Loss_dis: {:.4f}, "
                       "Prec@(cnn1, res1, cnn5, res5): {:.4%}, {:.4%}, {:.4%}, {:.4%}".format(
                           epoch, epochs, step, len(data_loader), loss_avg.avg,
                           mse_avg.avg, top1_cnn.avg, top1_res.avg,
                           top5_cnn.avg, top5_res.avg))
        global_step += 1

    logger.log("mask:")
    logger.log(mask_log)
    logger.log("Train, Epoch: [{:3d}/{}], Final Prec: cnn, res@1: {:.4%}, {:.4%}, "
               "Final Prec: cnn, res@5: {:.4%}, {:.4%} Loss: {:.4f}".format(
                   epoch, epochs, top1_cnn.avg, top1_res.avg,
                   top5_cnn.avg, top5_res.avg, loss_avg.avg))

def trainer(train_loader, valid_loader, model, criterion, optimizer_t,
            optimizer_s=None, lr_scheduler=None, stage=None):
    logger.log("start training..." + stage)
    best_top1 = 0.0
    epochs = args.baseline_epochs
    start_time = time.time()
    epoch_time = utils.AverageMeter()

    for epoch in range(args.start_epoch, epochs):
        # ---------------------------- adjust learning rate ----------------------------
        if args.lr_sch == "cosine":
            if optimizer_t is not None:
                adjust_learning_rateD(optimizer_t, epoch, epochs,
                                      lr_max=args.learning_rate,
                                      lr_min=args.learning_rate * 1e-3)
            if optimizer_s is not None:
                adjust_learning_rateD(optimizer_s, epoch, epochs,
                                      lr_max=args.learning_rate,
                                      lr_min=args.learning_rate * 1e-3)
        elif args.lr_sch == "imagenet":
            if optimizer_t is not None:
                adjust_learning_rateA(optimizer_t, epoch, args)
            if optimizer_s is not None:
                adjust_learning_rateA(optimizer_s, epoch, args)
        elif args.lr_sch == "step":
            if optimizer_t is not None:
                adjust_learning_rateS(optimizer_t, epoch, args)
            if optimizer_s is not None:
                adjust_learning_rateS(optimizer_s, epoch, args)
        else:
            raise NameError("unknown lr_sch name")
        # -------------------------------------------------------------------------------

        lr = (optimizer_t.param_groups[0]["lr"] if optimizer_t
              else optimizer_s.param_groups[0]["lr"])
        need_hour, need_mins, need_secs = convert_secs2time(
            epoch_time.val * (epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)
        logger.log(' [{:s}] :: {:3d}/{:3d} ----- [{:s}] {:s} LR={:}'.format(
            args.smodel_name, epoch, epochs, time_string(), need_time, lr))

        train(train_loader, model, criterion, optimizer_t, optimizer_s, epoch,
              stage, logger, args)
        global_step = (epoch + 1) * len(train_loader) - 1
        valid_top1 = valid(valid_loader, model, criterion, epoch, global_step,
                           stage=stage, logger=logger, args=args)

        if epoch == 0 or best_top1 < valid_top1:
            best_top1 = valid_top1
            is_best = True
        else:
            is_best = False
        if epoch >= 89:
            utils.save_checkpoint(model, logger.path('info'), is_best=is_best,
                                  pre=args.aim + "_" + "epoch_" + str(epoch) + "_" + stage)

        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("Final best valid Prec@1: {:.4%}".format(best_top1))

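# `adjust_learning_rateD` is called above as (optimizer, epoch, epochs, lr_max,
# lr_min) under the "cosine" branch. A plausible cosine-annealing sketch with
# that signature; the project's actual schedule may differ.
import math

def adjust_learning_rateD(optimizer, epoch, epochs, lr_max, lr_min):
    # cosine decay from lr_max at epoch 0 down to lr_min at the final epoch
    lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * epoch / epochs))
    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
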
def train(train_loader, model, criterion, optimizer, epoch, opt, num):
    """train for one epoch on the training set"""
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # training mode
    model.train()

    end = time.time()
    global positive_rec
    global negative_rec
    global loss_rec
    for i, (vfeat, afeat) in enumerate(train_loader):
        # shuffling the index orders
        bz = vfeat.size(0)
        orders = np.arange(bz).astype('int32')
        shuffle_orders = orders.copy()
        np.random.shuffle(shuffle_orders)

        # creating new data with the shuffled indices
        afeat2 = afeat[torch.from_numpy(shuffle_orders).long()].clone()

        # concat the vfeat and afeat respectively
        afeat0 = torch.cat((afeat, afeat2), 0)
        vfeat0 = torch.cat((vfeat, vfeat), 0)

        # generating the labels:
        # 1. the labels for the shuffled feats
        label1 = (orders == shuffle_orders + 0).astype('float32')
        target1 = torch.from_numpy(label1)
        # 2. the labels for the original feats
        label2 = label1.copy()
        label2[:] = 1
        target2 = torch.from_numpy(label2)
        # concat the labels together
        target = torch.cat((target2, target1), 0)
        target = 1 - target

        vfeat_var = vfeat0
        afeat_var = afeat0
        target_var = target

        # if you have a gpu, then shift data to GPU
        if opt.cuda:
            vfeat_var = vfeat_var.cuda()
            afeat_var = afeat_var.cuda()
            target_var = target_var.cuda()

        sim = model(vfeat_var, afeat_var)
        loss = criterion(sim, target_var)

        loss_rec.append(loss.item())
        positive_rec.append(torch.mean(sim[0:bz, 0]).item())
        negative_rec.append(torch.mean(sim[bz:bz * 2, 0]).item())

        # ##### alternative: N-pair loss (kept for reference)
        # for k in np.arange(bz):
        #     cur_vfeat = vfeat[k].clone()
        #     vfeat_k = cur_vfeat.repeat(bz, 1, 1)
        #     sim_k = model(vfeat_k, afeat)
        #     sim_k_0 = sim_k[:, 0].resize(1, bz)
        #     sim_k_1 = sim_k[:, 1].resize(1, bz)
        #     if k == 0:
        #         sim_0, sim_1 = sim_k_0.clone(), sim_k_1.clone()
        #     else:
        #         sim_0 = torch.cat((sim_0, sim_k_0), dim=0)
        #         sim_1 = torch.cat((sim_1, sim_k_1), dim=0)
        # loss = criterion(sim_0, sim_1)
        # loss_rec.append(loss.item())
        # positive_rec.append(torch.mean(torch.diag(sim_0)).item())
        # sim_0 = sim_0 - torch.diag(torch.diag(sim_0))
        # negative_rec.append(torch.mean(sim_0).item())

        # update loss in the loss meter
        losses.update(loss.item(), vfeat.size(0))

        # compute gradient and do sgd
        optimizer.zero_grad()
        loss.backward()
        # gradient clip stuff
        torch.nn.utils.clip_grad_norm_(model.parameters(), opt.gradient_clip)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % opt.print_freq == 0:
            log_str = ('No.{} Epoch: [{}][{}/{}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                           num, epoch, i, len(train_loader),
                           batch_time=batch_time, loss=losses))
            print(log_str)

def main():
    parser = argparse.ArgumentParser(
        description="Configuration for training an APC model")
    parser.add_argument("--test_config")
    parser.add_argument("--use_cmvn", default=False, action='store_true',
                        help="Use cmvn or not")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Training minibatch size")
    parser.add_argument("--load_data_workers", default=2, type=int,
                        help="Number of parallel data loaders")
    parser.add_argument("--resume_model", default='', type=str,
                        help="Path of the model to resume from")
    parser.add_argument("--print_freq", default=100, type=int,
                        help="Number of iterations between prints")
    parser.add_argument("--out_prob", type=str,
                        help="Output file to store phrase id log prob")
    parser.add_argument("--out_embedding", type=str,
                        help="Name of output embedding ark and scp file")
    parser.add_argument("--seed", default=1, type=int,
                        help="Random number seed")

    args = parser.parse_args()
    with open(args.test_config) as f:
        config = yaml.safe_load(f)
    config['path_test'] = [j for i, j in config['test_data'].items()]
    for key, val in config.items():
        setattr(args, key, val)
    for var in vars(args):
        config[var] = getattr(args, var)
    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    use_cuda = args.use_gpu
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    enc_args = Config()
    for key, val in args.encoder.items():
        setattr(enc_args, key, val)
    shared_encoder = make_se(enc_args.input_size, enc_args.N, enc_args.N_embed,
                             enc_args.d_model, enc_args.d_ff, enc_args.h,
                             enc_args.dropout)

    dec_args = Config()
    for key, val in args.asr_decoder.items():
        setattr(dec_args, key, val)
    dec_args.d_model = enc_args.d_model
    model = make_model(shared_encoder, dec_args)

    if args.resume_model:
        resume_model = torch.load(args.resume_model, map_location='cpu')
        model.load_state_dict(resume_model)

    num_params = 0
    for name, param in model.named_parameters():
        num_params += param.numel()
    print("Number of parameters: {}".format(num_params))

    if use_cuda:
        model = model.cuda()

    testset = SpeechDataset(args.path_test, args.left_ctx, args.right_ctx,
                            args.skip_frame)
    if args.use_cmvn:
        testset._load_cmvn(args.global_cmvn)
    test_loader = SpeechDataLoader(testset, args.batch_size,
                                   num_workers=args.load_data_workers,
                                   shuffle=False)
    print("Finish Loading test files. Number batches: {}".format(len(test_loader)))

    batch_time = utils.AverageMeter('Time', ':6.3f')
    progress = utils.ProgressMeter(len(test_loader), batch_time)
    end = time.time()

    ark_writer = WriteHelper('ark,scp:{}.ark,{}.scp'.format(args.out_embedding,
                                                            args.out_embedding))
    prob_writer = open(args.out_prob, 'w')
    with torch.no_grad():
        model.eval()
        for i, data in enumerate(test_loader):
            utt_list, feats, _, feat_sizes, _, _, _ = data
            batch_size, mask_size, _ = feats.size()
            # the encoder downsamples time by 2; integer division keeps
            # feat_sizes an integer tensor
            feat_sizes = feat_sizes // 2
            if args.use_gpu:
                feats = feats.cuda()
                feat_sizes = feat_sizes.cuda()

            _, phrase_out, t_embedding = model(feats, feat_sizes)
            logprob = F.log_softmax(phrase_out, dim=-1)
            for j in range(len(utt_list)):
                ark_writer(utt_list[j], t_embedding[j].cpu().numpy())
                prob_writer.write(utt_list[j] + ' '
                                  + str(logprob[j].cpu().numpy()) + '\n')

            batch_time.update(time.time() - end)
            if i % args.print_freq == 0:
                progress.print(i)