def main(global_step=global_step):
    train_dataset = dataset.get_train_dataset(src_file=config.train_src_file,
                                              tgt_file=config.train_tgt_file,
                                              tgt_vocab_table=tgt_vocab_table,
                                              batch_size=config.batch_size)
    init_acc = 0
    if config.eval_only:
        logger.info("======== Evaluation only ===============")
        test_acc = infer()
        logger.info("Test acc {:.4f}".format(test_acc))
    else:
        for epoch in range(config.max_epochs):
            total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
            for batch_data in train_dataset.take(config.steps_per_epoch):
                start_time = time.time()
                src_inputs, tgt_input_ids, tgt_output_ids, src_len, tgt_len = batch_data
                batch_size = src_inputs.shape[0]
                batch_loss = train_step(batch_data)
                total_loss += batch_loss * batch_size
                total_cnt += batch_size
                step_time += time.time() - start_time
                if (global_step + 1) % 100 == 0:
                    train_loss = total_loss / total_cnt
                    speed = total_cnt / step_time
                    logger.info("epoch {} global_step {} example-time {:.2f} total loss: {:.4f}".format(
                        epoch, global_step + 1, speed, train_loss))
                    total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
                global_step += 1
            test_acc = infer()
            checkpoint.save(file_prefix=chkpoint_prefix + "_acc_{:.4f}".format(test_acc) + "-" + str(global_step))
            logger.info("Saving model to {}".format(
                chkpoint_prefix + "_acc_{:.4f}".format(test_acc) + "-" + str(global_step)))
            if test_acc > init_acc:
                checkpoint.save(file_prefix=best_output + "_acc_{:.4f}".format(test_acc) + "-" + str(global_step))
                init_acc = test_acc
                logger.info("Currently the best acc {:.4f}".format(test_acc))
def main(global_step=global_step):
    train_dataset = dataset.get_train_dataset(src_file=config.train_src_file,
                                              tgt_file=config.train_tgt_file,
                                              tgt_vocab_table=tgt_vocab_table,
                                              batch_size=config.batch_size)
    init_bleu = 0
    if config.eval_only:
        logger.info("======== Evaluation only ===============")
        eval_bleu, eval_loss = eval()
        test_bleu = infer()
        logger.info("Eval loss {:.4f}, bleu {:.4f}".format(eval_loss, eval_bleu))
        logger.info("Test bleu {:.4f}".format(test_bleu))
    else:
        for epoch in range(global_epoch + 1, config.max_epochs):
            total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
            for batch_data in train_dataset.take(config.steps_per_epoch):
                start_time = time.time()
                src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
                batch_size = src_inputs.shape[0]
                batch_loss = train_step(batch_data)
                total_loss += batch_loss * batch_size
                total_cnt += batch_size
                step_time += time.time() - start_time
                if (global_step + 1) % 100 == 0:
                    train_loss = total_loss / total_cnt
                    train_ppl = misc_utils.safe_exp(total_loss / total_cnt)
                    speed = total_cnt / step_time
                    # current_lr = learning_rate(global_step)
                    logger.info(
                        "epoch {} global_step {} example-time {:.2f} total loss: {:.4f} ppl {:.4f}".format(
                            epoch, global_step + 1, speed, train_loss, train_ppl))
                    if math.isnan(train_ppl):
                        break
                    total_loss, total_cnt, step_time = 0.0, 0.0, 0.0
                global_step = tf.add(global_step, 1)
            eval_bleu, eval_loss = eval()
            test_bleu = infer()
            logger.info(
                "Epoch {}, Internal eval bleu {:.4f} loss {:.4f}, External test bleu {:.4f}".format(
                    epoch, eval_bleu, eval_loss, test_bleu))
            checkpoint.save(file_prefix=chkpoint_prefix + "_bleu_{:.4f}".format(test_bleu) + "-" + str(global_step))
            logger.info("Saving model to {}".format(
                chkpoint_prefix + "_bleu_{:.4f}".format(test_bleu) + "-" + str(global_step)))
            if test_bleu > init_bleu:
                checkpoint.save(file_prefix=best_output + "_bleu_{:.4f}".format(test_bleu) + "-" + str(global_step))
                init_bleu = test_bleu
                logger.info("Currently the best bleu {:.4f}".format(test_bleu))
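# The loop above converts the running loss into a perplexity with misc_utils.safe_exp, which is
# not shown in this snippet. A minimal sketch of what that helper is assumed to do (an exp() that
# saturates to inf instead of raising OverflowError on very large losses):
import math

def safe_exp(value):
    """exp(value), returning inf instead of raising OverflowError for large inputs."""
    try:
        return math.exp(value)
    except OverflowError:
        return float("inf")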
def main(global_step=global_step):
    train_dataset = dataset.get_train_dataset(src_file=config.train_src_file,
                                              tgt_file=config.train_tgt_file,
                                              tgt_vocab_table=tgt_vocab_table,
                                              batch_size=config.batch_size)
    init_wer = float("inf")  # WER: lower is better, so start from +inf
    if config.eval_only:
        logger.info("======== Evaluation only ===============")
        test_wer = infer()
        logger.info("Test wer {:.4f}".format(test_wer))
    else:
        for epoch in range(config.max_epochs):
            total_ctc_loss, total_reg_loss, total_cnt, step_time = 0.0, 0.0, 0.0, 0.0
            for batch_data in train_dataset.take(config.steps_per_epoch):
                start_time = time.time()
                src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
                batch_size = src_inputs.shape[0]
                ctc_loss, reg_loss = train_step(batch_data, epoch, global_step == 0)
                total_ctc_loss += ctc_loss * batch_size
                total_reg_loss += reg_loss * batch_size
                total_cnt += batch_size
                step_time += time.time() - start_time
                if (global_step + 1) % 100 == 0:
                    train_ctc_loss = total_ctc_loss / total_cnt
                    train_reg_loss = total_reg_loss / total_cnt
                    speed = total_cnt / step_time
                    logger.info(
                        "epoch {} global_step {} example-time {:.2f} ctc loss: {:.4f} reg loss: {:.4f}".format(
                            epoch, global_step + 1, speed, train_ctc_loss, train_reg_loss))
                    total_ctc_loss, total_reg_loss, total_cnt, step_time = 0.0, 0.0, 0.0, 0.0
                global_step = tf.add(global_step, 1)
            test_wer = infer()
            save_file_prefix = chkpoint_prefix + "_wer_{:.4f}".format(test_wer) + "-" + str(global_step.numpy())
            checkpoint.save(save_file_prefix)
            logger.info("Saving model to {}".format(save_file_prefix))
            if test_wer < init_wer:  # keep the checkpoint with the lowest WER
                checkpoint.save(file_prefix=best_output_predfix + "_wer_{:.4f}".format(test_wer) + "-" +
                                str(global_step.numpy()))
                init_wer = test_wer
                logger.info("Currently the best wer {:.4f}".format(test_wer))
def eval(): """internal evaluation """ dev_dataset = dataset.get_train_dataset(src_file=config.eval_src_file, tgt_file=config.eval_tgt_file, tgt_vocab_table=tgt_vocab_table, batch_size=config.batch_size) total_cnt, total_loss, total_bleu = 0.0, 0.0, 0.0 for batch_num, batch_data in enumerate(dev_dataset.take(config.debug_num)): src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data logits = model(batch_data, training=True) bs = logits.shape[0] xentropy, weights = metrics.padded_cross_entropy_loss( logits, tgt_output_ids, config.label_smoothing, vocab_size=tgt_vocab_size) batch_loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights) batch_bleu = metrics.bleu_score(logits=logits, labels=tgt_output_ids) total_cnt += bs total_loss += bs * batch_loss total_bleu += bs * batch_bleu eval_loss = total_loss / total_cnt eval_bleu = total_bleu / total_cnt return eval_bleu, eval_loss
def main_worker(gpus, args):
    # Define the model, loss function, and optimizer
    model = resnet18()
    torch.cuda.set_device('cuda:{}'.format(gpus[0]))
    model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)

    # apex initialization
    model, optimizer = amp.initialize(model, optimizer)

    # With more than one GPU, wrap the model in nn.DataParallel
    if len(gpus) > 1:
        model = nn.DataParallel(model, device_ids=gpus, output_device=gpus[0])

    # Define training schedule and dataloaders
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)
    train_dataset = get_train_dataset()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=4,
                                               pin_memory=True)
    test_dataset = get_test_dataset()
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              num_workers=4,
                                              pin_memory=True)

    # Training
    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        # Step the train_scheduler to adjust the learning rate
        train_scheduler.step(epoch)
        for step, (images, labels) in enumerate(train_loader):
            # Move the data onto the GPU
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Update the model weights; with apex, backpropagate through the scaled loss
            optimizer.zero_grad()
            # loss.backward()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            print(
                'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'
                .format(loss,
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch + 1,
                        trained_samples=step * args.batch_size + len(images),
                        total_samples=len(train_loader.dataset)))

        finish = time.time()
        print('epoch {} training time consumed: {:.2f}s'.format(
            epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion)
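# validate(test_loader, model, criterion) is called above but not defined in this snippet.
# A minimal sketch, assuming a plain classification setup (average loss and top-1 accuracy);
# the exact signature and metrics of the original helper may differ.
import torch

@torch.no_grad()
def validate(test_loader, model, criterion):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    for images, labels in test_loader:
        images = images.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        outputs = model(images)
        loss = criterion(outputs, labels)
        total_loss += loss.item() * labels.size(0)
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)
    print('Test loss: {:.4f}\tAcc: {:.4f}'.format(total_loss / total, correct / total))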
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  # push masked positions towards -inf before the softmax

    # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., q_len, kv_len)

    output = tf.matmul(attention_weights, v)  # (..., q_len, depth_v)

    return output, attention_weights, scaled_attention_logits


# print(os.getcwd())
base_path = "/home/panxie/Documents/sign-language/nslt/Data"
src_file = base_path + "/phoenix2014T.dev.sign"
tgt_file = base_path + "/phoenix2014T.dev.de"
tgt_vocab_table = create_tgt_vocab_table(base_path + "/phoenix2014T.vocab.de")
dataset = dataset.get_train_dataset(src_file, tgt_file, tgt_vocab_table)
cnt = 0
for data in dataset.take(1):
    cnt += 1
    src_inputs, tgt_in, tgt_out, src_path, src_len, tgt_len = data
    bs, t, h, w, c = src_inputs.shape
    print(src_inputs.shape, src_path)
    # Run every frame through the CNN, then restore the (batch, time, feature) layout
    src_inputs = tf.reshape(src_inputs, (bs * t, h, w, c))
    cnn_output = resnet_model(src_inputs, training=False)
    cnn_output = tf.reshape(cnn_output, (bs, t, -1))
    attention_out, atten_weights, atten_logits = scaled_dot_product_attention(
        cnn_output, cnn_output, cnn_output, mask=None)
    for i in range(100):
        # print(atten_logits[0, i, :])
        print(tf.nn.top_k(atten_logits[0, i, :], k=10).indices)
def main_worker(local_rank, nprocs, args):
    args.local_rank = local_rank
    init_seeds(local_rank + 1)

    # Build the init_method (communication endpoint) from the given IP and port
    init_method = 'tcp://' + args.ip + ':' + args.port

    # 1. Distributed initialization; every process has to do this, so it lives in main_worker
    cudnn.benchmark = True
    dist.init_process_group(backend='nccl',
                            init_method=init_method,
                            world_size=args.nprocs,
                            rank=local_rank)

    # 2. Basic definitions: model - loss function - optimizer
    model = resnet18()
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    criterion = nn.CrossEntropyLoss().cuda(local_rank)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)

    # apex initialization: use apex's SyncBatchNorm, then wrap the model with apex DDP
    model = apex.parallel.convert_syncbn_model(model).to(local_rank)
    model, optimizer = amp.initialize(model, optimizer)
    model = DDP(model)

    # 3. Load the data; each process only handles batch_size / nprocs samples
    batch_size = int(args.batch_size / nprocs)
    train_dataset = get_train_dataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=4,
                                               pin_memory=True,
                                               sampler=train_sampler)
    test_dataset = get_test_dataset()
    test_sampler = torch.utils.data.distributed.DistributedSampler(test_dataset)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              pin_memory=True,
                                              sampler=test_sampler)

    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        train_sampler.set_epoch(epoch)
        train_scheduler.step(epoch)
        for step, (images, labels) in enumerate(train_loader):
            # Move this process's share of the data onto its GPU
            images = images.cuda(local_rank, non_blocking=True)
            labels = labels.cuda(local_rank, non_blocking=True)
            outputs = model(images)
            loss = criterion(outputs, labels)

            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)

            # Update the model weights; backpropagate through the loss wrapped by scale_loss
            optimizer.zero_grad()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            if args.local_rank == 0:
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
                        reduced_loss,
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch + 1,
                        trained_samples=step * args.batch_size + len(images),
                        total_samples=len(train_loader.dataset)))

        finish = time.time()
        if args.local_rank == 0:
            print('epoch {} training time consumed: {:.2f}s'.format(epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion, local_rank, args)
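# reduce_mean(loss, args.nprocs) is used in both DDP workers but defined elsewhere. A minimal
# sketch of the usual implementation (all-reduce the tensor across processes, then divide by the
# number of processes); the original helper is assumed to behave like this.
import torch
import torch.distributed as dist

def reduce_mean(tensor, nprocs):
    rt = tensor.clone()  # keep the local loss untouched
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum the value across all processes
    rt /= nprocs  # average over processes
    return rt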
def main_worker(local_rank, nprocs, args):
    args.local_rank = local_rank
    init_seeds(local_rank + 1)  # set a different seed for each worker

    # Build the init_method (communication endpoint) from the given IP and port
    init_method = 'tcp://' + args.ip + ':' + args.port

    # 1. Distributed initialization; every process has to do this, so it lives in main_worker
    cudnn.benchmark = True
    dist.init_process_group(backend='nccl',
                            init_method=init_method,
                            world_size=args.nprocs,
                            rank=local_rank)

    # 2. Basic definitions: model - loss function - optimizer
    model = resnet18()  # define the model and bind this process to its GPU via .set_device(local_rank) / .cuda(local_rank)

    # The parts that need local_rank: the model
    # ================================
    torch.cuda.set_device(local_rank)  # use set_device and cuda to pick the GPU for this process
    model.cuda(local_rank)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank])  # wrap the model with DistributedDataParallel
    # =================================

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[60, 120, 160],
                                                     gamma=0.2)

    # 3. Load the data
    batch_size = int(args.batch_size / nprocs)  # split the global batch_size into per-process mini-batches by hand
    train_dataset = get_train_dataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=4,
                                               pin_memory=True,
                                               sampler=train_sampler)
    test_dataset = get_test_dataset()
    test_sampler = torch.utils.data.distributed.DistributedSampler(test_dataset)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              num_workers=4,
                                              pin_memory=True,
                                              sampler=test_sampler)

    for epoch in range(args.epochs):
        start = time.time()
        model.train()
        # Set the sampler's epoch to the current epoch so the dataloader's shuffling stays effective
        train_sampler.set_epoch(epoch)
        # Step the train_scheduler to adjust the learning rate
        train_scheduler.step(epoch)
        for step, (images, labels) in enumerate(train_loader):
            # Move this process's share of the data onto the GPU
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            outputs = model(images)
            loss = criterion(outputs, labels)

            # torch.distributed.barrier() blocks each process until every process has reached this
            # line, so the averaged loss/acc is not corrupted by processes running at different speeds
            torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)

            # Update the model weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if args.local_rank == 0:
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'
                    .format(reduced_loss,
                            optimizer.param_groups[0]['lr'],
                            epoch=epoch + 1,
                            trained_samples=step * args.batch_size + len(images),
                            total_samples=len(train_loader.dataset)))

        finish = time.time()
        if args.local_rank == 0:
            print('epoch {} training time consumed: {:.2f}s'.format(
                epoch, finish - start))

        # validate after every epoch
        validate(test_loader, model, criterion, local_rank, args)
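# main_worker(local_rank, nprocs, args) runs one process per GPU, so it is normally launched with
# torch.multiprocessing.spawn. A minimal launcher sketch; the argparse flags (--ip, --port, --lr,
# --batch_size, --epochs) only mirror the attributes accessed above and are assumptions.
import argparse
import torch
import torch.multiprocessing as mp

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ip', default='127.0.0.1')
    parser.add_argument('--port', default='23456')
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--epochs', type=int, default=200)
    args = parser.parse_args()
    args.nprocs = torch.cuda.device_count()  # one process per visible GPU
    # mp.spawn passes the process index as the first argument (local_rank)
    mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args))

if __name__ == '__main__':
    main()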