def run(self, task):
    """Run GLUE training / evaluation for a single task.

    Sets up the classifier head on top of the pretrained embedder, optionally
    converts both to fp16, then alternates training and evaluation for
    ``params.n_epochs`` epochs, accumulating results into ``self.scores``.
    """
    params = self.params

    # task parameters
    self.task = task
    params.out_features = N_CLASSES[task]
    self.is_classif = task != 'STS-B'  # STS-B is treated as regression

    # load data; the evaluation dictionary must match the pretrained one
    self.data = self.load_data(task)
    if not self.data['dico'] == self._embedder.dico:
        raise Exception(("Dictionary in evaluation data (%i words) seems different than the one " +
                         "in the pretrained model (%i words). Please verify you used the same dictionary, " +
                         "and the same values for max_vocab and min_count.") % (len(self.data['dico']), len(self._embedder.dico)))

    # embedder (deep copy so the pristine pretrained weights are kept)
    self.embedder = copy.deepcopy(self._embedder)
    self.embedder.cuda()

    # projection layer: dropout + linear classifier over task classes
    self.proj = nn.Sequential(*[
        nn.Dropout(params.dropout),
        nn.Linear(self.embedder.out_dim, params.out_features)
    ]).cuda()

    # float16: convert both networks before building the optimizer
    if params.fp16:
        assert torch.backends.cudnn.enabled
        self.embedder.model = network_to_half(self.embedder.model)
        self.proj = network_to_half(self.proj)

    # optimizer over the fine-tuned embedder layers + the projection head
    self.optimizer = get_optimizer(
        list(self.embedder.get_parameters(params.finetune_layers)) +
        list(self.proj.parameters()),
        params.optimizer)
    if params.fp16:
        self.optimizer = FP16_Optimizer(self.optimizer, dynamic_loss_scale=True)

    # train and evaluate the model
    for epoch in range(params.n_epochs):
        # update epoch
        self.epoch = epoch
        # training
        logger.info("GLUE - %s - Training epoch %i ..." % (task, epoch))
        self.train()
        # evaluation
        logger.info("GLUE - %s - Evaluating epoch %i ..." % (task, epoch))
        with torch.no_grad():
            scores = self.eval()
        self.scores.update(scores)
def create_optimizers(model, args, lr_schedule, prev_optimizer=None, prev_scheduler=None):
    """Build the fp16 FusedAdam optimizer and (optionally) an LR scheduler.

    :param model: network whose trainable parameters are optimized
    :param args: config with ``lr``, ``warmup`` and ``world_size``
    :param lr_schedule: multiplier callable handed to ``LambdaLR``
    :param prev_optimizer: optional optimizer whose state is restored
    :param prev_scheduler: optional scheduler whose state is restored
    :returns: ``(loss_model, optimizer, scheduler_or_None)``; the scheduler is
        ``None`` when ``args.warmup < 0`` (no schedule requested).
    """
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = FusedAdam(params, lr=args.lr)
    optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True, verbose=False)
    if prev_optimizer is not None:
        # Resume optimizer state (moments, dynamic loss scale) from a prior run.
        optimizer.load_state_dict(prev_optimizer.state_dict())

    scheduler = None
    if args.warmup < 0:
        print('No learning rate schedule used.')
    else:
        print('Using learning rate schedule.')
        # Schedule the wrapped (inner fp32) optimizer, not the fp16 wrapper.
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer.optimizer, lr_schedule)
        if prev_scheduler is not None:
            # Continue LR schedule from previous scheduler
            scheduler.load_state_dict(prev_scheduler.state_dict())

    loss_model = SimpleDistributedDataParallel(model, args.world_size)
    # BUG FIX: the old return guard (`scheduler if args.warmup > 0 else None`)
    # silently discarded the scheduler when args.warmup == 0 even though one
    # was built above; returning `scheduler` directly keeps the two conditions
    # consistent.
    return loss_model, optimizer, scheduler
def __init__(self, optimizer_class, optimizer_kwargs, criterion, is_fp16=False):
    """Build the native model, wrap it for (multi-)GPU use, and create the
    criterion and optimizer — optionally in apex mixed precision.

    :param optimizer_class: torch optimizer class, instantiated over the
        (possibly DataParallel-wrapped) model parameters
    :param optimizer_kwargs: keyword arguments forwarded to ``optimizer_class``
    :param criterion: loss module, moved to the same device as the model
    :param is_fp16: when True, convert the model with apex ``network_to_half``
        and wrap the optimizer in ``FP16_Optimizer``
    """
    super(Model, self).__init__()
    self._optimizer_kwargs = optimizer_kwargs
    self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): this reads `self.device` although only `self._device` is
    # assigned here — presumably a property defined elsewhere on the class;
    # confirm it exists, otherwise this line raises AttributeError.
    logger.info("Model device: {}".format(self.device))
    self._model = self.native_model_factory()
    self._is_fp16 = is_fp16
    if self._is_fp16:
        # Lazy load apex framework (function-scope import so apex is only
        # required when fp16 is actually requested)
        # noinspection PyUnresolvedReferences
        from apex.fp16_utils import network_to_half, FP16_Optimizer
        self._model = network_to_half(self._model)
    self._gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
    logger.info("GPU count: {}".format(self._gpu_count))
    self._model = DataParallel(self._model)
    self._model = self._model.to(self.device)
    self._criterion = criterion.to(self.device)
    self._optimizer = optimizer_class(self._model.parameters(), **optimizer_kwargs)
    if self._is_fp16:
        # FP16_Optimizer is in scope: the lazy import above ran because both
        # branches are guarded by the same flag.
        self._optimizer = FP16_Optimizer(self._optimizer)
def build_optimizer(model):
    """Build the optimizer named by the module-level ``args.optimizer`` over
    all of ``model``'s parameters.

    Reads ``args.optimizer`` (case-insensitive name), ``args.lr`` and
    ``args.half``.  When ``args.half`` is set, the optimizer is wrapped in
    apex's ``FP16_Optimizer`` with dynamic loss scaling.

    :param model: network whose parameters are optimized
    :returns: a torch optimizer, or an ``FP16_Optimizer`` wrapping one
    :raises KeyError: if ``args.optimizer`` is not a known name
    """
    optim_map = {
        "rmsprop": optim.RMSprop,
        "adam": optim.Adam,
        "adamnorm": AdamNormGrad,
        "adamw": AdamW,
        "adadelta": optim.Adadelta,
        "sgd": optim.SGD,
        "sgd_momentum": lambda params, lr: optim.SGD(
            params, lr=lr, weight_decay=1e-4, momentum=0.9),
        "lbfgs": optim.LBFGS
    }
    optimizer = optim_map[args.optimizer.lower().strip()](model.parameters(),
                                                          lr=args.lr)
    # BUG FIX: was `if args.half is True:` — an identity check that silently
    # skipped fp16 wrapping for truthy non-bool values (e.g. 1 from a config
    # file or argparse int flag). Plain truthiness is the intended behavior.
    if args.half:
        return FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    return optimizer
def get_optimizer_fp(self, module):
    """Create an optimizer for one sub-network.

    `module` must name one of the attributes 'model', 'encoder' or 'decoder';
    when fp16 training is enabled the optimizer is wrapped in apex's
    FP16_Optimizer with dynamic loss scaling.
    """
    assert module in ['model', 'encoder', 'decoder']
    target = getattr(self, module)
    opt = get_optimizer(target.parameters(), self.params.optimizer)
    return FP16_Optimizer(opt, dynamic_loss_scale=True) if self.params.fp16 else opt
def get_combo_optimizer_fp(self, modules):
    """Create one optimizer over the parameters of several sub-networks.

    `modules` is a tuple of attribute names; their parameters are concatenated
    into a single flat list and handed to `get_optimizer`. With fp16 enabled
    the result is wrapped in apex's FP16_Optimizer.
    """
    assert isinstance(modules, tuple)
    combined = []
    for name in modules:
        assert hasattr(self, name)
        for p in getattr(self, name).parameters():
            combined.append(p)
    opt = get_optimizer(combined, self.params.optimizer)
    if self.params.fp16:
        opt = FP16_Optimizer(opt, dynamic_loss_scale=True)
    return opt
def set_parameters(self, params):
    """Split (name, param) pairs into dense / sparse groups and build the
    optimizer selected by ``self.method``.

    With ``self.fp16`` set, adam uses apex FusedAdam and the final optimizer
    is wrapped in ``FP16_Optimizer`` with dynamic loss scaling.

    :param params: iterable of ``(name, parameter)`` pairs
    :raises RuntimeError: for an unknown ``self.method``
    """
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            # sparseadam routes embedding parameters to a separate SparseAdam
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params,
                                   lr=self.learning_rate,
                                   momentum=self.momentum)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                # seed the adagrad accumulator with the configured value
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        if self.fp16:
            # apex fused kernel; required for the FP16_Optimizer wrap below
            from apex.optimizers import FusedAdam
            self.optimizer = FusedAdam(self.params,
                                       lr=self.learning_rate,
                                       betas=self.betas,
                                       eps=1e-9)
        else:
            self.optimizer = optim.Adam(self.params,
                                        lr=self.learning_rate,
                                        betas=self.betas,
                                        eps=1e-9)
    elif self.method == 'sparseadam':
        # dense params go to Adam, embedding params to SparseAdam
        self.optimizer = MultipleOptimizer([
            optim.Adam(self.params,
                       lr=self.learning_rate,
                       betas=self.betas,
                       eps=1e-8),
            optim.SparseAdam(self.sparse_params,
                             lr=self.learning_rate,
                             betas=self.betas,
                             eps=1e-8)
        ])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
    if self.fp16:
        from apex.fp16_utils import FP16_Optimizer
        self.optimizer = FP16_Optimizer(self.optimizer, dynamic_loss_scale=True)
def set_optimizer(self):
    """Build ``self.optimizer`` from the training config.

    When ``model_conf.learn_center`` is set, center parameters (name contains
    'centers') are placed in a second param group. Optionally wraps the result
    in apex ``FP16_Optimizer`` and restores saved optimizer state on resume.
    """
    if not self.model_conf.learn_center:
        if self.conf.optim == 'SGD':
            self.optimizer = getattr(optim, 'SGD')(
                filter(lambda p: p.requires_grad, self.net.parameters()),
                lr=self.conf.lr_init,
                momentum=0.9,
                nesterov=True,
                weight_decay=self.conf.w_decay)  # default SGD
        else:
            self.optimizer = getattr(optim, self.conf.optim)(
                filter(lambda p: p.requires_grad, self.net.parameters()),
                lr=self.conf.lr_init,
                weight_decay=self.conf.w_decay)  # default SGD
    else:
        # Learn center: split params so the center vectors form their own group
        params_model = []
        params_center = []
        for n, p in self.net.named_parameters():
            if 'centers' in n and p.requires_grad:
                # TODO: check if classifier is also better if separated
                params_center.append(p)
            elif p.requires_grad:
                params_model.append(p)
        if self.conf.optim == 'SGD':
            self.optimizer = getattr(optim, 'SGD')(params_model,
                                                   lr=self.conf.lr_init,
                                                   momentum=0.9,
                                                   nesterov=True,
                                                   weight_decay=self.conf.w_decay)
            self.optimizer.add_param_group({'params': params_center})
        else:
            self.optimizer = getattr(optim, self.conf.optim)(
                params_model, lr=self.conf.lr_init, weight_decay=self.conf.w_decay)
            self.optimizer.add_param_group({
                'params': params_center
            })  # Other settings are same as the first group by default
    if self.model_conf.use_fp16:
        self.optimizer = FP16_Optimizer(self.optimizer)
    if self.conf.res:
        # resume: restore optimizer state only if the optimizer type matches
        if self.tp.get_meta('optim') == self.conf.optim:
            if 'optim_state' in self.tp.meta_dict.keys():
                self.optimizer.load_state_dict(
                    self.tp.get_meta('optim_state'))
                print('Optimizer Internal State Restored')
def build_optimizer(parameters):
    """Create an AdamCosineWithWarmup optimizer over `parameters`.

    When the module-level `params.fp16` flag is set, the optimizer is wrapped
    in apex's FP16_Optimizer so mixed-precision loss scaling is handled.
    """
    base = AdamCosineWithWarmup(parameters,
                                betas=(0.9, 0.98),
                                eps=1e-6,
                                weight_decay=0.01)
    if params.fp16:
        # Use apex's FP 16 optimizer for mixed precison and to do loss scaling
        return FP16_Optimizer(base, dynamic_loss_scale=True)
    return base
def init_schedule(config, optimizer, train_loader):
    """Create a linear warmup/decay LR schedule spanning the whole run.

    Total steps = ``len(train_loader) * config.epochs``; warmup covers
    ``config.warmup_ratio`` of them.

    :returns: the scheduler only — see the note below about the optimizer.
    """
    t_total = len(train_loader) * config.epochs
    warmup_steps = t_total * config.warmup_ratio
    # NOTE(review): `switch` is not defined in this function — presumably a
    # module-level fp16 flag; confirm it exists when this is called.
    if switch:
        # NOTE(review): the FP16_Optimizer wrapper is bound only to this
        # local name — the caller's optimizer is NOT replaced; only the
        # scheduler (attached to the inner optimizer) escapes this function.
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        scheduler = get_linear_schedule_with_warmup(
            optimizer.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    return scheduler
def run(self):
    """Run XNLI training / evaluation.

    Builds a 3-way classification head on top of the pretrained embedder,
    optionally converts to fp16, then alternates training and evaluation for
    ``params.n_epochs`` epochs, accumulating results into ``self.scores``.
    """
    params = self.params

    # load data; vocabulary must match the pretrained embedder exactly
    self.data = self.load_data()
    assert len(self.data['dico']) == self._embedder.n_words

    # embedder (deep copy so the pristine pretrained weights are kept)
    self.embedder = copy.deepcopy(self._embedder)
    self.embedder.cuda()

    # projection layer: dropout + linear over the 3 XNLI labels
    self.proj = nn.Sequential(
        *[nn.Dropout(params.dropout),
          nn.Linear(self.embedder.out_dim, 3)]).cuda()

    # float16: convert both networks before building the optimizer
    if params.fp16:
        assert torch.backends.cudnn.enabled
        self.embedder.model = network_to_half(self.embedder.model)
        self.proj = network_to_half(self.proj)

    # optimizer over the fine-tuned embedder layers + the projection head
    self.optimizer = get_optimizer(
        list(self.embedder.get_parameters(params.finetune_layers)) +
        list(self.proj.parameters()),
        params.optimizer)
    if params.fp16:
        self.optimizer = FP16_Optimizer(self.optimizer, dynamic_loss_scale=True)

    # train and evaluate the model
    for epoch in range(params.n_epochs):
        # update epoch
        self.epoch = epoch
        # training
        logger.info("XNLI - Training epoch %i ..." % epoch)
        self.train()
        # evaluation
        logger.info("XNLI - Evaluating epoch %i ..." % epoch)
        with torch.no_grad():
            scores = self.eval()
        self.scores.update(scores)
def adjust_learning_rate(epoch, pack):
    """Lazily build the SGD optimizer + LambdaLR scheduler on first call,
    then step the scheduler to ``epoch`` and return the current LR(s).

    :param epoch: epoch index passed to the scheduler
    :param pack: container holding ``net``, ``optimizer`` and ``lr_scheduler``
    """
    if pack.optimizer is None:
        if cfg.train.optim == 'sgd' or cfg.train.optim is None:
            # Base lr is 1 so the LambdaLR multiplier alone defines the lr.
            pack.optimizer = optim.SGD(pack.net.parameters(),
                                       lr=1,
                                       momentum=cfg.train.momentum,
                                       weight_decay=cfg.train.weight_decay,
                                       nesterov=cfg.train.nesterov)
        else:
            print('WRONG OPTIM SETTING!')
            assert False
        pack.lr_scheduler = optim.lr_scheduler.LambdaLR(
            pack.optimizer, get_lr_func())
        if cfg.base.fp16 and cfg.base.cuda:
            # apex is only required when fp16 is actually enabled
            from apex.fp16_utils import FP16_Optimizer
            pack.optimizer = FP16_Optimizer(pack.optimizer,
                                            dynamic_loss_scale=True)
    # NOTE(review): passing an epoch to scheduler.step() relies on the old
    # epoch-indexed scheduler API, deprecated in newer torch releases.
    pack.lr_scheduler.step(epoch)
    return pack.lr_scheduler.get_lr()
def __get_opimizer(self):
    """Build the training optimizer from ``self.args.train`` settings.

    Supports 'SGD', 'Adam' and 'AdamW'; in half-precision mode the Adam
    variants use apex FusedAdam and the result is wrapped in FP16_Optimizer.
    Optionally wraps everything in Lookahead when ``train['LA']`` is True.
    (The method name keeps its original typo for caller compatibility.)

    :returns: the configured optimizer (possibly fp16- and Lookahead-wrapped)
    :raises ValueError: for an unknown optimizer name
    """
    weight_decay = self.args.train['weight_decay']
    opt_name = self.args.train['optimizer']
    if opt_name == 'SGD':
        optimizer = optim.SGD(self.net.parameters(),
                              lr=self.getLR(0),
                              momentum=0.9,
                              weight_decay=weight_decay)
    elif opt_name == 'Adam':
        if self.half:
            optimizer = FusedAdam(self.net.parameters(), lr=self.getLR(0))
        else:
            optimizer = optim.Adam(self.net.parameters(), lr=self.getLR(0))
    elif opt_name == 'AdamW':
        if self.half:
            optimizer = FusedAdam(self.net.parameters(),
                                  lr=self.getLR(0),
                                  adam_w_mode=True)
        else:
            optimizer = optim.AdamW(self.net.parameters(), lr=self.getLR(0))
    else:
        # BUG FIX: an unknown optimizer name previously fell through and
        # raised an opaque NameError on the first use of `optimizer`.
        raise ValueError("Unsupported optimizer: {}".format(opt_name))
    if self.half:
        optimizer = FP16_Optimizer(optimizer,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={'scale_factor': 3})
        # Start dynamic scaling from 512 instead of the apex default.
        optimizer.loss_scale = 512
    if self.args.train['LA'] == True:
        optimizer = Lookahead(optimizer, k=5, alpha=0.5)
    return optimizer
def create_supervised_trainer(
        model: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        loss_fn: torch.nn.Module,
        max_norm: float = None,
        norm_type: int = 2,
        device: torch.cuda.device = None,
        non_blocking: bool = False,
        mixed_precision: bool = False,
        static_loss_scale: int = 512,
        prepare_batch: Callable = _prepare_batch) -> Engine:
    """Create an Engine that runs one optimization step per batch.

    :param model: network to train (moved to ``device`` when given)
    :param optimizer: base optimizer; wrapped in apex ``FP16_Optimizer`` with
        static loss scaling when ``mixed_precision`` is True
    :param loss_fn: criterion applied to ``(model(x), y)``
    :param max_norm: when set, gradients are clipped to this norm
    :param norm_type: p-norm used for clipping
    :returns: an ignite-style ``Engine`` returning the per-batch loss value
    """
    if device:
        model.to(device)
    if mixed_precision:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=static_loss_scale)

    def _process_function(engine: Engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        if mixed_precision:
            # FP16_Optimizer scales the loss and owns the backward pass.
            optimizer.backward(loss)
            if max_norm:
                # BUG FIX: clip the fp32 *master* gradients, not the fp16
                # model gradients — clipping model.parameters() had no effect
                # on the master copy FP16_Optimizer actually steps with.
                optimizer.clip_master_grads(max_norm, norm_type)
        else:
            loss.backward()
            if max_norm:
                clip_grad_norm_(model.parameters(), max_norm, norm_type)
        optimizer.step()
        return loss.item()

    return Engine(_process_function)
def main(**kwargs):
    """Train the configured model in fp16, keep the best checkpoint by
    validation F1, then run the test set and write probability / submission
    files.

    Keyword arguments override fields of ``DefaultConfig``.
    """
    args = DefaultConfig()
    args.parse(kwargs)
    if not torch.cuda.is_available():
        args.cuda = False
        args.device = None
    torch.manual_seed(args.seed)  # set random seed for cpu

    train_iter, val_iter, test_iter, args.vocab_size, vectors = data.load_data(args)

    global best_score

    # init model
    model = getattr(models, args.model)(args, vectors)

    # model checkpoint location
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    save_path = os.path.join(args.save_dir,
                             '{}_{}.pth'.format(args.model, args.id))

    if args.cuda:
        torch.cuda.manual_seed(args.seed)  # set random seed for gpu
        model.cuda()

    # criterion and optimizer
    criterion = F.cross_entropy
    lr1, lr2 = args.lr1, args.lr2
    optimizer = torch.optim.Adam(model.parameters(), lr=lr1, betas=(0.9, 0.99))
    # Mixed precision: every backward pass below must go through
    # optimizer.backward(), which only the FP16_Optimizer wrapper provides.
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

    for i in range(args.max_epochs):
        total_loss = 0.0
        correct = 0
        total = 0

        model.train()
        for idx, batch in enumerate(train_iter):
            # BatchNorm layers cannot handle a batch of size 1
            if len(batch) == 1:
                continue
            text, label = batch.text, batch.label
            if args.cuda:
                text, label = text.cuda(), label.cuda()
            optimizer.zero_grad()
            pred = model(text)
            loss = criterion(pred, label)
            # FP16_Optimizer scales the loss and owns the backward pass
            optimizer.backward(loss)
            optimizer.step()

            # update running statistics
            total_loss += float(loss.item())
            predicted = pred.max(1)[1]
            total += label.size(0)
            correct += predicted.eq(label).sum().item()

            if idx % 80 == 79:
                # BUG FIX: the accumulated loss covers 80 batches (reset
                # below), so divide by 80 — the old `/ 20` overstated it 4x.
                print('[{}, {}] loss: {:.3f} | Acc: {:.3f}%({}/{})'.format(
                    i + 1, idx + 1, total_loss / 80,
                    100. * correct / total, correct, total))
                total_loss = 0.0

        # score on the validation set and adapt the learning rate
        f1score = val(model, val_iter, args)
        if f1score > best_score:
            best_score = f1score
            checkpoint = {'state_dict': model.state_dict(), 'config': args}
            torch.save(checkpoint, save_path)
            print('Best tmp model f1score: {}'.format(best_score))
        if f1score < best_score:
            # reload the best weights and decay the learning rates
            model.load_state_dict(torch.load(save_path)['state_dict'])
            lr1 *= args.lr_decay
            lr2 = 2e-4 if lr2 == 0 else lr2 * 0.8
            optimizer = torch.optim.Adam(model.parameters(), lr=lr1,
                                         betas=(0.9, 0.99))
            # BUG FIX: re-wrap the fresh optimizer — the training loop calls
            # optimizer.backward(), which a bare torch Adam does not have, so
            # the first epoch after an LR decay used to crash.
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
            print('* load previous best model: {}'.format(best_score))
            print('* model lr:{} emb lr:{}'.format(lr1, lr2))
        if lr1 < args.min_lr:
            print('* training over, best f1 score: {}'.format(best_score))
            break

    # save the final model
    args.best_score = best_score
    final_model = {'state_dict': model.state_dict(), 'config': args}
    best_model_path = os.path.join(
        args.save_dir,
        '{}_{}_{}.pth'.format(args.model, args.text_type, best_score))
    torch.save(final_model, best_model_path)
    print('Best Final Model saved in {}'.format(best_model_path))

    # run on the test set; write probabilities and the submission file
    if not os.path.exists('result/'):
        os.mkdir('result/')
    probs, test_pred = test(model, test_iter, args)
    result_path = 'result/' + '{}_{}_{}'.format(args.model, args.id,
                                                args.best_score)
    np.save('{}.npy'.format(result_path), probs)
    print('Prob result {}.npy saved!'.format(result_path))
    test_pred[['id', 'class']].to_csv('{}.csv'.format(result_path), index=None)
    print('Result {}.csv saved!'.format(result_path))
    t2 = time.time()
    # NOTE(review): t1 is not defined in this function — presumably a
    # module-level start time; confirm it exists.
    print('time use: {}'.format(t2 - t1))
def main():
    """Tacotron2 / WaveGlow training entry point.

    Parses CLI args (two-stage: generic then model-specific), sets up DLLogger
    metrics, optionally resumes from a checkpoint and runs distributed /
    fp16 training with per-epoch validation and checkpointing.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        # only rank 0 writes the JSON log in a distributed run
        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_items/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss",
                           metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    # second parse pass: add model-specific arguments, then re-parse fully
    model_name = args.model_name
    parser = models.parse_model_args(model_name, parser)
    parser.parse_args()
    args = parser.parse_args()
    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    distributed_run = args.world_size > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name, model_config,
                             to_fp16=args.fp16_run, to_cuda=True)

    epoch_start = 0
    if args.resume:
        # the checkpoint path depends on which of the two models is trained
        resume_model_path = args.resume_tacotron2_path if args.model_name == "Tacotron2" else args.resume_waveglow_path
        checkpoint = torch.load(resume_model_path, map_location='cpu')
        epoch_start = checkpoint["epoch"]
        state_dict = checkpoint['state_dict']
        # strip the DistributedDataParallel 'module.' prefix if present
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        model.load_state_dict(state_dict)
        print("restore model %s" % resume_model_path)

    if distributed_run:
        model = DDP(model)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    if args.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=args.dynamic_loss_scaling)

    # sigma only exists for WaveGlow; Tacotron2 has no such argument
    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    criterion = loss_functions.get_loss_function(model_name, sigma)

    # n_frames_per_step only exists for Tacotron2
    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(model_name,
                                                     n_frames_per_step)
    trainset = data_functions.get_data_loader(model_name, args.dataset_path,
                                              args.training_files, args)
    train_sampler = DistributedSampler(trainset) if distributed_run else None
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.dataset_path,
                                            args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    model.train()

    LOGGER.log(key=tags.TRAIN_LOOP)

    for epoch in range(epoch_start, args.epochs):
        LOGGER.epoch_start()
        epoch_start_time = time.time()
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        # used to calculate avg loss over epoch
        train_epoch_avg_loss = 0.0
        num_iters = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        for i, batch in enumerate(train_loader):
            LOGGER.iteration_start()
            iter_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)
            print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch))

            start = time.perf_counter()
            adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            if args.fp16_run:
                # cast inputs down / outputs up around the fp16 model
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                # average the loss (and sum item counts) across workers
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

            train_epoch_avg_loss += reduced_loss
            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.fp16_run:
                # FP16_Optimizer scales the loss; clip the fp32 master grads
                optimizer.backward(loss)
                grad_norm = optimizer.clip_master_grads(args.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if args.fp16_run else False
            iteration += 1

            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
            iter_stop_time = time.time()
            iter_time = iter_stop_time - iter_start_time
            LOGGER.log(key="train_iter_items/sec",
                       value=(reduced_num_items / iter_time))
            LOGGER.log(key="iter_time", value=iter_time)
            LOGGER.iteration_stop()

        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        LOGGER.log(key="train_epoch_items/sec",
                   value=(reduced_num_items_epoch / epoch_time))
        LOGGER.log(key="train_epoch_avg_loss",
                   value=(train_epoch_avg_loss / num_iters
                          if num_iters > 0 else 0.0))
        LOGGER.log(key="epoch_time", value=epoch_time)

        LOGGER.log(key=tags.EVAL_START, value=epoch)
        validate(model, criterion, valset, iteration, args.batch_size,
                 args.world_size, collate_fn, distributed_run, args.rank,
                 batch_to_gpu, args.fp16_run)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        # checkpoint on rank 0 only, and never right after a loss-scale overflow
        if not overflow and (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0:
            checkpoint_path = os.path.join(
                args.output_directory,
                "checkpoint_{}_{}".format(model_name, epoch))
            save_checkpoint(model, epoch, model_config, checkpoint_path)
            save_sample(
                model_name, model, args.waveglow_checkpoint,
                args.tacotron2_checkpoint, args.phrase_path,
                os.path.join(args.output_directory,
                             "sample_{}_{}.wav".format(model_name, iteration)),
                args.sampling_rate, args.fp16_run)

        LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)

    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
# --- CARN setup: fp16 conversion, checkpoint restore, optimizer, cyclic LR ---
model.total_parameters()
model.initialize_weights_xavier_uniform()
# Convert the network to half precision before moving it to the GPU;
# the checkpoint is loaded into the already-converted model.
model = network_to_half(model)
model = model.cuda()
model.load_state_dict(torch.load("CARN_model_checkpoint.pt"))

learning_rate = 1e-4
weight_decay = 1e-6
optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       weight_decay=weight_decay,
                       amsgrad=True)
# optimizer = optim.SGD(model.parameters(), momentum=0.9, nesterov=True, weight_decay=weight_decay, lr=learning_rate)

# Static loss scaling for fp16 training
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0, verbose=False)
# optimizer.load_state_dict(torch.load("CARN_adam_checkpoint.pt"))

last_iter = -1  # torch.load("CARN_scheduler_last_iter")
# Cycle over the *inner* fp32 optimizer; with base_lr == max_lr the cyclic
# schedule effectively keeps the learning rate constant here.
scheduler = CyclicLR(optimizer.optimizer,
                     base_lr=1e-4,
                     max_lr=1e-4,
                     step_size=3 * total_batch,
                     mode="triangular",
                     last_batch_iteration=last_iter)

# per-epoch metric histories
train_loss = []
train_ssim = []
train_psnr = []
test_loss = []
test_ssim = []
# --- Minimal FP16_Optimizer example: half-precision linear regression under
# --- DistributedDataParallel. The ### markers highlight the two lines that
# --- differ from an ordinary fp32 training loop.
torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16
# random half-precision inputs / targets on the GPU
x = Variable(torch.cuda.FloatTensor(N, D_in).normal_()).half()
y = Variable(torch.cuda.FloatTensor(N, D_out).normal_()).half()

model = torch.nn.Linear(D_in, D_out).cuda().half()
model = torch.nn.parallel.DistributedDataParallel(
    model,
    device_ids=[args.local_rank],
    output_device=args.local_rank)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

### CONSTRUCT FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    ### CHANGE loss.backward() TO: ###
    optimizer.backward(loss)
    ###
    optimizer.step()

print("final loss = ", loss)
def train_network(net, model_ckpt, fold=0):
    """Train ``net``, checkpointing the best validation-loss model to
    ``model_ckpt``; KeyboardInterrupt stops training gracefully, after which
    the user may interactively generate a submission.
    """
    # train the network, allow for keyboard interrupt
    try:
        # define optimizer
        # optimizer = optim.SGD(net.parameters(), lr=config.lr, momentum=0.9, weight_decay=configs.l2)
        if config.fp16:
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          net.parameters()),
                                   lr=config.lr,
                                   eps=1e-04)
            from apex.fp16_utils import FP16_Optimizer
            optimizer = FP16_Optimizer(optimizer,
                                       dynamic_loss_scale=True,
                                       verbose=False)
        else:
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          net.parameters()),
                                   lr=config.lr)

        valid_patience = 0
        best_val_loss = None
        best_val_f1 = None
        cycle = 0
        t_ = 0

        if args.resume:
            net, optimizer, start_epoch, best_val_loss = load_checkpoint(
                net, optimizer, model_ckpt)

        if config.reduce_lr_plateau:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, 'min', config.lr_scale, config.lr_patience, True)

        if config.cosine_annealing:
            # precomputed per-epoch lr values plus the epochs ending each cycle
            cos_lr, cycle_ends = cosine_annealing_lr(config.min_lr,
                                                     config.max_lr,
                                                     config.cycle_size,
                                                     config.epochs,
                                                     config.cycle_size_inc)

        # get the loaders
        train_loader, valid_loader = get_data_loaders(
            imsize=config.imsize,
            num_channels=config.num_channels,
            batch_size=config.batch_size,
            test_size=config.test_size,
            num_workers=config.num_workers,
            preload=config.preload_data,
            external_data=config.external_data,
            mixup=config.mixup)

        # loss = F1Loss()
        if hasattr(config, 'focal_gamma'):
            loss = FocalLoss(config.focal_gamma)
        else:
            loss = FocalLoss()
        # loss = nn.BCEWithLogitsLoss().cuda()
        # if hasattr(config, 'focal_gamma'):
        #     loss = FocalTverskyLoss(gamma = config.focal_gamma)
        # else:
        #     loss = FocalTverskyLoss()

        # training flags
        freeze_bn = False
        save_imgs = False

        train_losses = []
        valid_losses = []
        valid_f1s = []
        lr_hist = []

        print('Training ...')
        print('Saving to ', model_ckpt)
        for e in range(config.epochs):
            print('\n' + 'Epoch {}/{}'.format(e, config.epochs))
            start = time.time()

            t_l = train(net, optimizer, loss, train_loader, freeze_bn)
            v_l, v_f1 = valid(net, optimizer, loss, valid_loader,
                              save_imgs, fold)

            if config.reduce_lr_plateau:
                scheduler.step(v_l)

            if config.cosine_annealing:
                # set this epoch's lr directly from the precomputed schedule
                for param_group in optimizer.param_groups:
                    param_group['lr'] = cos_lr[e]
                if (e in cycle_ends):
                    # snapshot the model at the end of each annealing cycle
                    cycle = np.where(cycle_ends == e)[0][0] + 1
                    net.eval()
                    torch.save(
                        net.state_dict(),
                        model_ckpt.replace('best', 'cycle{}'.format(cycle)))
                    print("Cycle {} completed. Saving model to {}".format(
                        cycle,
                        model_ckpt.replace('best', 'cycle{}'.format(cycle))))

            lr_hist.append(optimizer.param_groups[0]['lr'])

            state = {
                'epoch': e,
                'arch': config.model_name,
                'state_dict': net.state_dict(),
                'best_val_loss': best_val_loss,
                'optimizer': optimizer.state_dict(),
            }

            # save the model on best validation loss
            if best_val_loss is None or v_l < best_val_loss:
                best_val_loss = v_l
                net.eval()
                torch.save(state, model_ckpt)
                valid_patience = 0
                print('Best val loss achieved. loss = {:.4f}.'.format(v_l),
                      " Saving model to ", model_ckpt)
            # save the model on best validation f1
            # if best_val_f1 is None or v_f1 > best_val_f1:
            #     net.eval()
            #     torch.save(net.state_dict(), model_ckpt.replace('best', 'bestf1'))
            #     best_val_f1 = v_f1
            #     valid_patience = 0
            #     print('Best val F1 achieved. F1 = {:.4f}.'.
            #           format(v_f1), " Saving model to ", model_ckpt.replace('best', 'bestf1'))
            #     if (e > 5):
            #         SUBM_OUT = './subm/{}_{}_epoch{}.csv'.format(
            #             config.model_name, config.exp_name, str(e))
            #         generate_submission(net, config, SUBM_OUT)
            else:
                valid_patience += 1

            # always keep a rolling 'latest' checkpoint as well
            torch.save(state, model_ckpt.replace('best', 'latest'))

            train_losses.append(t_l)
            valid_losses.append(v_l)
            valid_f1s.append(v_f1)

            log_metrics(train_losses, valid_losses, valid_f1s, lr_hist, e,
                        model_ckpt, config)

            t_ += 1
            print('Time: {:d}s'.format(int(time.time() - start)))

    except KeyboardInterrupt:
        pass

    gen_sub = input(
        "\n\nGenerate submission while the GPU is still hot from training? [Y/n]: "
    )
    if gen_sub in ['Y', 'y', 'Yes', 'yes']:
        generate_submission(net, config)
def initialize(self, opt):
    """Set up CycleGAN networks, losses and optimizers, optionally in half
    precision (apex ``FP16_Optimizer`` + ``.half()`` networks)."""
    BaseModel.initialize(self, opt)
    if opt.half:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except ImportError:
            # NOTE(review): if apex is missing this only prints — the
            # FP16_Optimizer uses further down will then raise NameError.
            print("Please install NVIDIA Apex for safe mixed precision")
    # specify the training losses you want to print out. The program will call base_model.get_current_losses
    self.loss_names = ['D_A', 'G_A', 'cycle_A', 'idt_A',
                       'D_B', 'G_B', 'cycle_B', 'idt_B']
    # specify the images you want to save/display. The program will call base_model.get_current_visuals
    visual_names_A = ['real_A', 'fake_B', 'rec_A']
    visual_names_B = ['real_B', 'fake_A', 'rec_B']
    if self.isTrain and self.opt.lambda_identity > 0.0:
        visual_names_A.append('idt_A')
        visual_names_B.append('idt_B')

    self.visual_names = visual_names_A + visual_names_B
    # specify the models you want to save to the disk. The program will call base_model.save_networks and base_model.load_networks
    if self.isTrain:
        self.model_names = ['G_A', 'G_B', 'D_A', 'D_B']
    else:  # during test time, only load Gs
        self.model_names = ['G_A', 'G_B']

    # load/define networks
    # The naming conversion is different from those used in the paper
    # Code (paper): G_A (G), G_B (F), D_A (D_Y), D_B (D_X)
    self.netG_A = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf,
                                    opt.netG, opt.norm, not opt.no_dropout,
                                    opt.init_type, opt.init_gain, self.gpu_ids)
    self.netG_B = networks.define_G(opt.output_nc, opt.input_nc, opt.ngf,
                                    opt.netG, opt.norm, not opt.no_dropout,
                                    opt.init_type, opt.init_gain, self.gpu_ids)
    if opt.half:
        # generators converted to fp16 before any optimizer is created
        self.netG_A = self.netG_A.half()
        self.netG_B = self.netG_B.half()

    if self.isTrain:
        use_sigmoid = opt.no_lsgan
        self.netD_A = networks.define_D(opt.output_nc, opt.ndf, opt.netD,
                                        opt.n_layers_D, opt.norm, use_sigmoid,
                                        opt.init_type, opt.init_gain,
                                        self.gpu_ids)
        self.netD_B = networks.define_D(opt.input_nc, opt.ndf, opt.netD,
                                        opt.n_layers_D, opt.norm, use_sigmoid,
                                        opt.init_type, opt.init_gain,
                                        self.gpu_ids)
        self.fake_A_pool = ImagePool(opt.pool_size)
        self.fake_B_pool = ImagePool(opt.pool_size)
        # define loss functions
        self.criterionGAN = networks.GANLoss(use_lsgan=not opt.no_lsgan,
                                             half_precision=opt.half).to(self.device)
        self.criterionCycle = torch.nn.L1Loss()
        self.criterionIdt = torch.nn.L1Loss()
        # initialize optimizers
        self.optimizer_G = torch.optim.Adam(
            itertools.chain(self.netG_A.parameters(), self.netG_B.parameters()),
            lr=opt.lr, betas=(opt.beta1, 0.999))
        self.optimizer_D = torch.optim.Adam(
            itertools.chain(self.netD_A.parameters(), self.netD_B.parameters()),
            lr=opt.lr, betas=(opt.beta1, 0.999))
        if opt.half:
            # discriminators are converted in place after optimizer creation;
            # the optimizers keep references to the same Parameter objects
            self.netD_A = self.netD_A.half()
            self.netD_B = self.netD_B.half()
            self.optimizer_G = FP16_Optimizer(self.optimizer_G,
                                              dynamic_loss_scale=True)
            self.optimizer_D = FP16_Optimizer(self.optimizer_D,
                                              dynamic_loss_scale=True)
        self.optimizers = []
        self.optimizers.append(self.optimizer_G)
        self.optimizers.append(self.optimizer_D)
def init_model(self):
    """Build (or restore from checkpoint) the MemTransformerLM, optionally
    enable apex fp16, and wrap it for multi-GPU execution.

    :returns: the underlying (unwrapped) model
    """
    n_layer = int(
        (self.depth - 1) / 2)  # depth = n_layer * (multi-head + ffn) + linear-softmax
    d_model = self.width
    d_inner = self.width * 2
    vocab_size = self.vocab
    tgt_len = self.bptt_len
    if self.d_embed < 0:
        self.d_embed = d_model

    # Mixed-floating point precision (if fp16 is enabled, storage will be with half-precision)
    if self.fp16 and 'cuda' not in self.device:
        print('WARNING: fp16 requires cuda, ignoring fp16 option')
        self.fp16 = False
    elif self.fp16:
        try:
            from apex.fp16_utils import FP16_Optimizer
            # NOTE(review): this references the module-level `args` and wraps
            # `self.optimizer`, which may not exist yet when init_model runs —
            # confirm both are defined before this method is called.
            self.optimizer = FP16_Optimizer(
                self.optimizer,
                static_loss_scale=args.static_loss_scale,
                dynamic_loss_scale=args.dynamic_loss_scale,
                dynamic_loss_args={'init_scale': 2**16})
        except:
            print('WARNING: apex not installed, ignoring fp16 option')
            self.fp16 = False

    if self.restart:
        # NOTE(review): `restart_dir` and the bare `fp16` below are not
        # defined in this scope — presumably self.restart_dir / self.fp16;
        # this branch raises NameError as written.
        with open(os.path.join(restart_dir, 'model.pt'), 'rb') as f:
            model = torch.load(f)
        if not fp16:
            model = model.float()
        model.apply(self.update_dropout)
        model.apply(self.update_dropatt)
    else:
        model = MemTransformerLM(vocab_size,
                                 n_layer,
                                 self.n_head,
                                 d_model,
                                 self.d_head,
                                 d_inner,
                                 self.dropout,
                                 self.dropatt,
                                 tie_weight=self.tied,
                                 d_embed=self.d_embed,
                                 div_val=self.div_val,
                                 tie_projs=[False],
                                 pre_lnorm=self.pre_lnorm,
                                 tgt_len=self.tgt_len,
                                 ext_len=self.ext_len,
                                 mem_len=self.mem_len,
                                 cutoffs=[],
                                 same_length=self.same_length,
                                 attn_type=self.attn_type,
                                 clamp_len=self.clamp_len,
                                 sample_softmax=-1)
        model.apply(self.weights_init)
        model.word_emb.apply(
            self.weights_init
        )  # ensure embedding init is not overridden by out_layer in case of weight sharing

    self.model = model
    self.n_all_param = sum([p.nelement() for p in model.parameters()])
    self.n_nonemb_param = sum(
        [p.nelement() for p in model.layers.parameters()])

    if self.multi_gpu:
        self.model = self.model.to(self.device)
        if self.gpu0_bsz >= 0:
            # give GPU 0 a smaller batch share via BalancedDataParallel
            self.para_model = BalancedDataParallel(self.gpu0_bsz,
                                                   self.model,
                                                   dim=1).to(self.device)
        else:
            self.para_model = nn.DataParallel(self.model,
                                              dim=1).to(self.device)
    else:
        self.para_model = self.model.to(self.device)

    return model
def main():
    """CLI entry point: parse args, then run `--iterations` independent
    train/evaluate runs (optionally sparse and/or fp16) on MNIST or CIFAR-10.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=100, metavar='N', help='input batch size for training (default: 100)')
    parser.add_argument('--test-batch-size', type=int, default=100, metavar='N', help='input batch size for testing (default: 100)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N', help='number of epochs to train (default: 100)')
    parser.add_argument('--lr', type=float, default=0.1, metavar='LR', help='learning rate (default: 0.1)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='SGD momentum (default: 0.9)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=17, metavar='S', help='random seed (default: 17)')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N', help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', type=str, default='./models/model.pt', help='For Saving the current Model')
    parser.add_argument('--data', type=str, default='mnist')
    parser.add_argument('--augment', action='store_true')
    parser.add_argument('--decay_frequency', type=int, default=25000)
    parser.add_argument('--l1', type=float, default=0.0)
    parser.add_argument('--fp16', action='store_true', help='Run in fp16 mode.')
    parser.add_argument('--valid_split', type=float, default=0.1)
    parser.add_argument('--resume', type=str)
    parser.add_argument('--start-epoch', type=int, default=1)
    parser.add_argument('--model', type=str, default='')
    parser.add_argument('--l2', type=float, default=5.0e-4)
    parser.add_argument('--iterations', type=int, default=1, help='How many times the model should be run after each other. Default=1')
    parser.add_argument('--save-features', action='store_true', help='Resumes a saved model and saves its feature data to disk for plotting.')
    parser.add_argument('--bench', action='store_true', help='Enables the benchmarking of layers and estimates sparse speedups')
    sparselearning.core.add_sparse_args(parser)
    args = parser.parse_args()

    if args.fp16:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except ImportError:
            # Was a bare `except:`; only a missing apex should disable fp16.
            print('WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print_and_log('\n\n')
    print_and_log('=' * 80)
    print_and_log('=' * 80)
    print_and_log(args)
    torch.manual_seed(args.seed)

    for i in range(args.iterations):
        print_and_log("\nIteration start: {0}/{1}\n".format(i + 1, args.iterations))

        if args.data == 'mnist':
            train_loader, valid_loader, test_loader = get_mnist_dataloaders(args, validation_split=args.valid_split)
        else:
            train_loader, valid_loader, test_loader = get_cifar10_dataloaders(args, args.valid_split)

        if args.model not in models:
            print('You need to select an existing model via the --model argument. Available models include: ')
            for key in models:
                print('\t{0}'.format(key))
            raise Exception('You need to select a model')
        else:
            cls, cls_args = models[args.model]
            # BUG FIX: the original appended save_features/bench directly to
            # the list stored in the shared `models` registry, so each outer
            # iteration grew the list and `cls(*cls_args)` got extra
            # arguments from the second iteration on. Copy before extending.
            cls_args = list(cls_args) + [args.save_features, args.bench]
            model = cls(*cls_args).to(device)
            print_and_log(model)
            print_and_log('=' * 60)
            print_and_log(args.model)
            print_and_log('=' * 60)
            print_and_log('=' * 60)
            print_and_log('Death mode: {0}'.format(args.death))
            print_and_log('Growth mode: {0}'.format(args.growth))
            print_and_log('Redistribution mode: {0}'.format(args.redistribution))
            print_and_log('=' * 60)

        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.l2, nesterov=True)
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer, args.decay_frequency, gamma=0.1)

        # Optionally resume; in --save-features mode this also dumps feature
        # histograms and returns to the loop after evaluation.
        if args.resume:
            if os.path.isfile(args.resume):
                print_and_log("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(args.resume)
                args.start_epoch = checkpoint['epoch']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print_and_log("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
                print_and_log('Testing...')
                evaluate(args, model, device, test_loader)
                plot_class_feature_histograms(args, model, device, train_loader, optimizer)
            else:
                print_and_log("=> no checkpoint found at '{}'".format(args.resume))

        if args.fp16:
            # Wrap AFTER any optimizer state was restored above.
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=None, dynamic_loss_scale=True, dynamic_loss_args={'init_scale': 2**16})
            model = model.half()

        mask = None
        if args.sparse:
            decay = CosineDecay(args.death_rate, len(train_loader) * (args.epochs))
            mask = Masking(optimizer, death_mode=args.death, death_rate_decay=decay, growth_mode=args.growth, redistribution_mode=args.redistribution)
            mask.add_module(model, density=args.density)

        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            train(args, model, device, train_loader, optimizer, epoch, lr_scheduler, mask)
            if args.valid_split > 0.0:
                # Validation accuracy is logged inside evaluate(); the return
                # value was previously bound to an unused local.
                evaluate(args, model, device, valid_loader)
            save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, is_best=False, filename=args.save_model)
            if args.sparse and epoch < args.epochs:
                mask.at_end_of_epoch()
            print_and_log('Current learning rate: {0}. Time taken for epoch: {1}.\n'.format(optimizer.param_groups[0]['lr'], time.time() - t0))

        evaluate(args, model, device, test_loader)
        print_and_log("\nIteration end: {0}/{1}\n".format(i + 1, args.iterations))
def main():
    """Entry point for the prune/rewire training script: build loaders for
    mnist/cifar10/imagenet (chosen from the model-name prefix), construct the
    model and optimizer, optionally resume, then run the train/prune loop.
    """
    global args, best_prec1
    args = parser.parse_args()
    setup_logger(args)

    if args.fp16:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except ImportError:
            # Was a bare `except:`; only a missing apex should disable fp16.
            print_and_log('WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

    kwargs = {'num_workers': 1, 'pin_memory': True}
    # The dataset is encoded as the first segment of the model name,
    # e.g. 'mnist_mlp' -> 'mnist'.
    dataset = args.model.split('_')[0]

    if dataset == 'mnist':
        full_dataset = datasets.MNIST('./data', train=True, download=True,
                                      transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]))
        if not (args.validate_set):
            train_loader = torch.utils.data.DataLoader(full_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
            val_loader = None
        else:
            train_dataset = split_dataset(full_dataset, split_end=50000)
            val_dataset = split_dataset(full_dataset, split_start=50000)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
            val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data', train=False,
                           transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))])),
            batch_size=args.batch_size, shuffle=False, **kwargs)
    elif dataset == 'cifar10':
        normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                         std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        if args.augment:
            # Reflect-pad by 4 then random-crop back to 32x32 + horizontal flip.
            transform_train = transforms.Compose([
                transforms.ToTensor(),
                transforms.Lambda(lambda x: F.pad(x.unsqueeze(0), (4, 4, 4, 4), mode='reflect').squeeze()),
                transforms.ToPILImage(),
                transforms.RandomCrop(32),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        else:
            transform_train = transforms.Compose([transforms.ToTensor(), normalize])
        transform_test = transforms.Compose([transforms.ToTensor(), normalize])
        full_dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transform_train)
        if not (args.validate_set):
            train_loader = torch.utils.data.DataLoader(full_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
            val_loader = None
        else:
            train_dataset = split_dataset(full_dataset, split_end=45000)
            val_dataset = split_dataset(full_dataset, split_start=45000)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
            val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10('./data', train=False, transform=transform_test),
            batch_size=args.batch_size, shuffle=True, **kwargs)
    elif dataset == 'imagenet':
        if not (args.data):
            raise Exception('need to specify imagenet dataset location using the --data argument')
        traindir = os.path.join(args.data, 'train')
        valdir = os.path.join(args.data, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        full_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize]))
        train_sampler = None
        if not (args.validate_set):
            train_loader = torch.utils.data.DataLoader(full_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
                                                       num_workers=args.workers, pin_memory=True, sampler=train_sampler)
            val_loader = None
        else:
            # Hold out the last 10k images for validation.
            train_dataset = split_dataset(full_dataset, split_end=len(full_dataset) - 10000)
            val_dataset = split_dataset(full_dataset, split_start=len(full_dataset) - 10000)
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
                                                       num_workers=args.workers, pin_memory=True, sampler=train_sampler)
            val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False,
                                                     num_workers=4, pin_memory=True)
        test_loader = torch.utils.data.DataLoader(
            datasets.ImageFolder(valdir, transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize])),
            batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True)
    else:
        raise RuntimeError('Unknown dataset {}. Dataset is first segment of network name'.format(dataset))

    print_and_log(args)

    with open(args.schedule_file, 'r') as stream:
        try:
            # BUG FIX: yaml.load() without an explicit Loader is unsafe and
            # raises TypeError on PyYAML >= 6; the schedule is plain data.
            loaded_schedule = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print_and_log(exc)

    if args.model == 'mnist_mlp':
        model = mnist_mlp(initial_sparsity=args.initial_sparsity_fc, sparse=not (args.tied), no_batch_norm=args.no_batch_norm)
    elif args.model == 'cifar10_WideResNet':
        model = cifar10_WideResNet(args.layers, widen_factor=args.widen_factor,
                                   initial_sparsity_conv=args.initial_sparsity_conv,
                                   initial_sparsity_fc=args.initial_sparsity_fc,
                                   sub_kernel_granularity=args.sub_kernel_granularity,
                                   sparse=not (args.tied))
    elif args.model == 'imagenet_resnet50':
        model = imagenet_resnet50(initial_sparsity_conv=args.initial_sparsity_conv,
                                  initial_sparsity_fc=args.initial_sparsity_fc,
                                  sub_kernel_granularity=args.sub_kernel_granularity,
                                  widen_factor=args.widen_factor,
                                  vanilla_conv1=True, vanilla_conv3=True,
                                  vanilla_downsample=True,
                                  sparse=not args.sparse_momentum)
    else:
        raise RuntimeError('unrecognized model name ' + repr(args.model))

    model = model.cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,
                                nesterov=args.nesterov, weight_decay=args.weight_decay)
    if args.fp16:
        print_and_log('FP16')
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=None, dynamic_loss_scale=True,
                                   dynamic_loss_args={'init_scale': 2**16})
        model = model.half()

    mask = None
    if not args.dense:
        decay = CosineDecay(args.prune_rate, len(train_loader) * (args.epochs))
        mask = Masking(optimizer, decay, prune_rate=args.prune_rate, prune_mode='magnitude',
                       growth_mode=args.growth, redistribution_mode=args.redistribution,
                       verbose=True, fp16=args.fp16)
        mask.add_module(model, density=args.density)

    if dataset == 'imagenet':
        print_and_log('setting up data parallel')
        model = torch.nn.DataParallel(model).cuda()
        base_model = model.module
    else:
        base_model = model

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print_and_log("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
                print_and_log('OPTIM')
                mask.optimizer = optimizer
            print_and_log("=> loaded checkpoint '{}' ".format(args.resume))
        else:
            print_and_log("=> no checkpoint found at '{}'".format(args.resume))

    # Optionally copy a sparsity mask from another checkpoint (only the
    # entries whose name contains 'mask').
    if args.copy_mask_from:
        if os.path.isfile(args.copy_mask_from):
            print_and_log("=> loading mask data '{}'".format(args.copy_mask_from))
            mask_data = torch.load(args.copy_mask_from)
            filtered_mask_data = collections.OrderedDict([(x, y) for (x, y) in mask_data['state_dict'].items() if 'mask' in x])
            model.load_state_dict(filtered_mask_data, strict=False)
        else:
            print_and_log("=> no mask checkpoint found at '{}'".format(args.copy_mask_from))

    model_size = base_model.get_model_size()
    cudnn.benchmark = True

    # Define loss function (criterion).
    criterion = nn.CrossEntropyLoss().cuda()

    # Per-epoch metric histories.
    train_loss_l = []
    test_loss_l = []
    train_prec1_l = []
    test_prec1_l = []
    train_prec5_l = []
    test_prec5_l = []
    val_loss_l = []
    val_prec1_l = []
    val_prec5_l = []

    prune_mode = args.prune_mode
    print_and_log('PRUNE MODE ' + str(prune_mode))
    start_pruning_after_epoch_n = args.start_pruning_after_epoch
    prune_every_epoch_n = args.prune_epoch_frequency
    prune_iterations = args.prune_iterations
    post_prune_epochs = args.post_prune_epochs
    filename = args.model + '_' + repr(args.job_idx)
    n_prunes_done = 0

    if prune_mode:
        # Special consideration so that pruning mnist_mlp does not use less
        # than 100 parameters in the top layer after pruning.
        if args.prune_target_sparsity_fc > 0.9 and args.model == 'mnist_mlp':
            total_available_weights = (1. - args.prune_target_sparsity_fc) * (784 * 300 + 300 * 100 + 100 * 10) - 100
            prune_target_sparsity_special = 0.9
            prune_target_sparsity_fc = 1. - total_available_weights / (784 * 300 + 300 * 100)
        else:
            prune_target_sparsity_fc = prune_target_sparsity_special = args.prune_target_sparsity_fc

        # Per-iteration fraction so that `prune_iterations` rounds compound
        # to the target sparsity.
        prune_fraction_fc = 1.0 - (1 - prune_target_sparsity_fc)**(1.0 / prune_iterations)
        prune_fraction_conv = 1.0 - (1 - args.prune_target_sparsity_conv)**(1.0 / prune_iterations)
        prune_fraction_fc_special = 1.0 - (1 - prune_target_sparsity_special)**(1.0 / prune_iterations)

        cubic_pruning_multipliers = (1 - np.arange(prune_iterations + 1) / prune_iterations)**3.0

        def get_prune_fraction_cubic(current_prune_iter, final_sparsity):
            # Fraction to prune this round under the cubic schedule.
            return 1 - (1 - final_sparsity + final_sparsity * cubic_pruning_multipliers[current_prune_iter + 1]) / (
                1 - final_sparsity + final_sparsity * cubic_pruning_multipliers[current_prune_iter])

        nEpochs_to_prune = int(start_pruning_after_epoch_n + prune_every_epoch_n * (prune_iterations - 1)) + post_prune_epochs
        print_and_log('prune fraction fc : {} , prune_fraction conv : {} '.format(prune_fraction_fc, prune_fraction_conv))
        print_and_log('nepochs ' + repr(nEpochs_to_prune))
        filename += '_target_' + repr(args.prune_target_sparsity_fc) + ',' + repr(args.prune_target_sparsity_conv)
        validate(test_loader, model, criterion, 1, 'validate')
        save_checkpoint({'model_size': base_model.get_model_size(), 'model_name': args.model,
                         'state_dict': model.state_dict(), 'args': args},
                        filename=filename + '_initial')

    current_iteration = 0
    lr_schedule = loaded_schedule['lr_schedule']
    rewire_schedule = loaded_schedule['rewire_period_schedule']
    DeepR_temperature_schedule = loaded_schedule['DeepR_temperature_schedule']
    threshold = 1.0e-3

    if args.resume:
        print_and_log("Validating...")
        validate(test_loader, model, criterion, 1, 'validate')

    for epoch in range(args.start_epoch, nEpochs_to_prune if prune_mode else args.epochs):
        adjust_learning_rate(optimizer, epoch, lr_schedule)
        rewire_period = get_schedule_val(rewire_schedule, epoch)
        DeepR_temperature = get_schedule_val(DeepR_temperature_schedule, epoch)
        print_and_log('rewiring every {} iterations'.format(rewire_period))

        t1 = time.time()
        current_iteration, threshold = train(mask, train_loader, model, criterion, optimizer, epoch,
                                             current_iteration, rewire_period, DeepR_temperature, threshold)
        print_and_log('epoch time ' + repr(time.time() - t1))

        if prune_mode and epoch >= start_pruning_after_epoch_n and (
                epoch - start_pruning_after_epoch_n) % prune_every_epoch_n == 0 and n_prunes_done < prune_iterations:
            if args.cubic_prune_schedule:
                base_model.prune(get_prune_fraction_cubic(n_prunes_done, prune_target_sparsity_fc),
                                 get_prune_fraction_cubic(n_prunes_done, args.prune_target_sparsity_conv),
                                 get_prune_fraction_cubic(n_prunes_done, prune_target_sparsity_special))
            else:
                base_model.prune(prune_fraction_fc, prune_fraction_conv, prune_fraction_fc_special)
            n_prunes_done += 1
            print_and_log(base_model.get_model_size())

        if not (args.no_validate_train):
            prec1_train, prec5_train, loss_train = validate(train_loader, model, criterion, epoch, 'train')
        else:
            prec1_train, prec5_train, loss_train = 0.0, 0.0, 0.0
        if args.validate_set:
            prec1_val, prec5_val, loss_val = validate(val_loader, model, criterion, epoch, 'validate')
        else:
            prec1_val, prec5_val, loss_val = 0.0, 0.0, 0.0
        prec1_test, prec5_test, loss_test = validate(test_loader, model, criterion, epoch, 'test')

        test_loss_l.append(loss_test)
        train_loss_l.append(loss_train)
        val_loss_l.append(loss_val)
        test_prec1_l.append(prec1_test)
        train_prec1_l.append(prec1_train)
        val_prec1_l.append(prec1_val)
        test_prec5_l.append(prec5_test)
        train_prec5_l.append(prec5_train)
        val_prec5_l.append(prec5_val)

        # Remember metric histories and save checkpoint(s).
        filenames = [filename]
        if epoch == args.stop_rewire_epoch:
            filenames += [filename + '_StopRewiringPoint_' + repr(epoch)]
        for f in filenames:
            save_checkpoint(
                {
                    'model_size': base_model.get_model_size(),
                    'test_loss': test_loss_l,
                    'train_loss': train_loss_l,
                    'val_loss': val_loss_l,
                    'test_prec1': test_prec1_l,
                    'train_prec1': train_prec1_l,
                    'val_prec1': val_prec1_l,
                    'test_prec5': test_prec5_l,
                    'train_prec5': train_prec5_l,
                    # BUG FIX: was train_prec5_l — validation top-5 history
                    # was being saved as a second copy of the train history.
                    'val_prec5': val_prec5_l,
                    'model_name': args.model,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch + 1,
                    'args': args
                },
                filename=f)

        if not args.dense and epoch < args.epochs:
            mask.at_end_of_epoch()

    print_and_log('Best accuracy: ', best_prec1)
def main():
    """BERT-for-SQuAD entry point: parse args, optionally fine-tune
    (`--do_train`) and/or predict (`--do_predict`), supporting fp16 (apex)
    and distributed training via `--local_rank`.
    """
    parser = argparse.ArgumentParser()
    # Required arguments
    parser.add_argument('--task', default='multi', type=str, help='Task affecting load data and vectorize feature')
    # 'double'/'single' only matters for the multi task.
    parser.add_argument('--loss_type', default='double', type=str, help='Select loss double or single, only for multi task')
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased,bert-large-uncased, "
                             "bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,bert-base-chinese,"
                             "bert-base-multilingual-cased.")
    # Normally this should stay False.
    parser.add_argument("--debug", default=False, help="Whether run on small dataset")
    parser.add_argument("--output_dir", default="./SQuAD/output/", type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")
    # Other arguments
    parser.add_argument("--train_file", default="./SQuAD/version/train.json", type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default="./SQuAD/version/prediction.json", type=str,
                        help="SQuAD json for predictio ns. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will be "
                             "truncated to this length.")
    # Control arguments
    parser.add_argument("--do_train", default=True, help="Whether to run training.")
    parser.add_argument("--do_predict", default=True, help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=18, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=18, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated.This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False,
                        help="If true, all of the warnings related to data processing will be printed.A number of "
                             "warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=True,
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', default=False, help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--version_2_with_negative', default=False,
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    # Single-machine (possibly multi-GPU) setup vs. distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        # BUG FIX: 'hierarchical_copy' is not a valid torch.distributed
        # backend (valid: nccl / gloo / mpi); NCCL is the one for GPUs.
        torch.distributed.init_process_group(backend='nccl')

    # Logging setup; only rank 0 (or single-process runs) logs at INFO.
    logging.basicConfig(format='%(asctime)s-%(levelname)s-%(name)s-%(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device:{}, n_gpu:{}, distributed training:{}, 16-bits training:{}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))

    # The per-step batch size is the requested total divided by the number
    # of accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed everything for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # Choose the example reader for the task.
    if args.task == 'squad':
        read_examples = read_squad_examples
    elif args.task == 'multi':
        read_examples = read_multi_examples
    else:
        # Fail early instead of a NameError when read_examples is used below.
        raise ValueError('unknown task: {}'.format(args.task))

    # Load training examples and compute the number of optimization steps.
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_examples(input_file=args.train_file, is_training=True,
                                       version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            train_examples = train_examples[:100]
        num_train_optimization_steps = \
            int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Model preparation.
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
    if args.fp16:
        model.half()
    model.to(device)

    # Optimizer configuration.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        if args.fp16:
            try:
                from apex.fp16_utils import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=True)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                 warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

    # Training loop.
    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True)
        logger.info("***** Running training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_start_vector = torch.tensor([f.start_vector for f in train_features], dtype=torch.float)
        all_end_vector = torch.tensor([f.end_vector for f in train_features], dtype=torch.float)
        all_content_vector = torch.tensor([f.content_vector for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions,
                                   all_start_vector, all_end_vector, all_content_vector)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            # Re-wrap every epoch so multi-GPU scattering stays enabled after
            # the unwrap at the end of the previous epoch.
            model = torch.nn.DataParallel(model).cuda()
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration",
                                              disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions, start_vector, end_vector, content_vector = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions,
                             start_vector, end_vector, content_vector, args.loss_type)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                print("loss率为:{}".format(loss))
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            print("\n")
            print(ep)
            # Save a per-epoch snapshot, unwrapping DataParallel first so the
            # config is reachable.
            output_model_file = os.path.join(args.output_dir, str(ep) + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir, str(ep) + CONFIG_NAME)
            torch.save(model.state_dict(), output_model_file)
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            model.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)

    # Save the fine-tuned model and reload it for prediction.
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    # Prediction: generate the output prediction files.
    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = \
            read_examples(input_file=args.predict_file, is_training=False,
                          version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            eval_examples = eval_examples[:100]
        eval_features = convert_examples_to_features(
            examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False)
        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(eval_examples))
        logger.info(" Num split examples = %d", len(eval_features))
        logger.info(" Batch size = %d", args.predict_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

        middle_result = os.path.join(args.output_dir, 'middle_result.pkl')
        # BUG FIX: the file handle passed to pickle.dump was never closed;
        # use a context manager so the dump is flushed and the fd released.
        with open(middle_result, 'wb') as fh:
            pickle.dump([eval_examples, eval_features, all_results], fh)

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        if (args.loss_type == 'double'):
            write_predictions_couple_labeling(
                eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length,
                args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file,
                args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'single'):
            write_predictions_single_labeling(
                eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length,
                args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file,
                args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'origin') or (args.task == 'multi' and args.loss_type == 'squad'):
            write_predictions(
                eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length,
                args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file,
                args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
        else:
            raise ValueError('{} dataset and {} loss is not support'.format(args.task, args.loss_type))
def train():
    """Assemble the corpus, model and optimizer for the selected pipeline
    stage (table2pivot or pivot2text), then hand everything to the Trainer
    and start training.
    """
    # Stage selection: table->pivot uses the simpler corpus variant.
    if opt.part == 'table2pivot':
        data = Table2PivotCorpus(vocab_size=opt.vocab_size,
                                 max_len=opt.src_max_len,
                                 batch_size=opt.batch_size,
                                 log_dir=opt.dir,
                                 scale=opt.scale,
                                 mode=opt.mode)
    else:
        data = Pivot2TextCorpus(vocab_size=opt.vocab_size,
                                src_max_len=opt.src_max_len,
                                tgt_max_len=opt.tgt_max_len,
                                batch_size=opt.batch_size,
                                share=opt.share,
                                log_dir=opt.dir,
                                scale=opt.scale,
                                append_rate=opt.append_rate,
                                drop_rate=opt.drop_rate,
                                blank_rate=opt.blank_rate,
                                setting=opt.setting,
                                mode=opt.mode,
                                use_feature=opt.feature)

    net = Pivot(emb_size=opt.emb_size,
                key_emb_size=opt.key_emb_size,
                pos_emb_size=opt.pos_emb_size,
                hidden_size=opt.hidden_size,
                n_hidden=opt.n_hidden,
                n_block=opt.n_block,
                ff_size=opt.ff_size,
                n_head=opt.n_head,
                enc_layers=opt.enc_layers,
                dec_layers=opt.dec_layers,
                dropout=opt.dropout,
                bidirectional=opt.bidirectional,
                beam_size=opt.beam_size,
                max_decoding_step=opt.max_step,
                minimum_length=opt.minimum_length,
                label_smoothing=opt.label_smoothing,
                share=opt.share,
                part=opt.part,
                vocab=data.vocab,
                use_feature=opt.feature,
                arch=opt.arch)

    if opt.fp16:
        # Half-precision path: needs NVIDIA apex for the fused optimizer
        # and master-weight loss scaling.
        net.half()
        net.to(device)
        try:
            from apex.fp16_utils import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        trainer_optim = FusedAdam(net.parameters(), lr=opt.lr,
                                  bias_correction=False)
        # loss_scale == 0 selects dynamic loss scaling, otherwise static.
        if opt.loss_scale == 0:
            trainer_optim = FP16_Optimizer(trainer_optim,
                                           dynamic_loss_scale=True)
        else:
            trainer_optim = FP16_Optimizer(trainer_optim,
                                           static_loss_scale=opt.loss_scale)
    else:
        net.to(device)
        if opt.optimizer == 'adagrad':
            trainer_optim = optim.Adagrad(net.parameters(), lr=opt.lr,
                                          initial_accumulator_value=0.1)
        else:
            trainer_optim = optim.Adam(net.parameters(), lr=opt.lr)

    # Reduce LR when the validation metric plateaus (mode='max': higher is better).
    lr_scheduler = LearningRateWithMetricsWrapper(
        torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=trainer_optim,
                                                   mode='max',
                                                   patience=2))

    evaluator = Predictor(dataset=data.test_dataset,
                          dataloader=data.test_loader,
                          corpus=data,
                          cuda_device=opt.gpu)

    trainer = Trainer(model=net,
                      optimizer=trainer_optim,
                      learning_rate_scheduler=lr_scheduler,
                      learning_rate_decay=opt.lr_decay,
                      ema_decay=opt.ema_decay,
                      predictor=evaluator,
                      train_loader=data.train_loader,
                      train_dataset=data.train_dataset,
                      validation_metric=data.metrics,
                      cuda_device=opt.gpu,
                      patience=4,
                      num_epochs=opt.epoch,
                      serialization_dir=data.log_dir,
                      num_serialized_models_to_keep=3,
                      summary_interval=opt.report,
                      should_log_parameter_statistics=False,
                      grad_norm=opt.grad_norm,
                      fp16=opt.fp16)
    trainer.train()
model = models.__dict__['resnet50'](low_dim=args.low_dim) model = torch.nn.DataParallel(model).cuda() optimizer = torch.optim.SGD(model.parameters(), 0.03, momentum=0.9, weight_decay=1e-4) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) model.load_state_dict(checkpoint['state_dict']) optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss, verbose=False) optimizer.load_state_dict(checkpoint['optimizer']) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] lemniscate = checkpoint['lemniscate'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Can work with any model, but it assumes that the model has a # feature method, and a classifier method, # as in the VGG models in torchvision.
audio_conf=audio_conf, labels=labels, rnn_type=supported_rnns[rnn_type], mixed_precision=args.mixed_precision) model = model.to(device) if args.mixed_precision: model = convert_model_to_half(model) print("Number of parameters: %d" % DeepSpeech.get_param_size(model)) parameters = model.parameters() optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True, weight_decay=1e-5) if args.distributed: model = DistributedDataParallel(model) if args.mixed_precision: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) criterion = CTCLoss() seconds = int(args.seconds) batch_size = int(args.batch_size) def iteration(inputs): # targets, align half of the audio targets = torch.ones(int(batch_size * ((seconds * 100) / 2))) target_sizes = torch.empty(batch_size, dtype=torch.int).fill_(int((seconds * 100) / 2)) input_percentages = torch.ones(batch_size).fill_(1) input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
def model_main(conf, pitch_classes, time_steps_vocab):
    """Run the full model pipeline (data, model, optimizer, training, test)
    from the setup specified in <conf>.

    Params
    ======
    conf: dict
        config from conf/train_conf.yaml
    pitch_classes: dict
        Dict of drum pitch mappings (from conf/drum_pitches.yaml)
    time_steps_vocab: dict
        Dict of tick:token mappings (from conf/time_steps_vocab.yaml)

    Fixes applied (review):
      * inner train() used `global` for counters that are locals of this
        function — changed to `nonlocal` so the epoch loop's
        `train_step == max_step` check actually sees the updates.
      * sparse-optimizer warmup wrote `param_groups[0]['learning_rate']`,
        a dead key; PyTorch param groups use 'lr' (as the dense branch does).
    """
    model_conf = conf['model']
    data_conf = conf['data']

    if model_conf['d_embed'] < 0:
        # Negative embedding size means "same as the model width".
        model_conf['d_embed'] = model_conf['d_model']

    assert model_conf[
        'ext_len'] >= 0, 'extended context length must be non-negative'
    # Each batch is split into `batch_chunk` micro-batches; must divide evenly.
    assert model_conf['train_batch_size'] % model_conf['batch_chunk'] == 0

    # Unique, timestamped working directory per run.
    model_conf['work_dir'] = '{}-{}'.format(model_conf['work_dir'],
                                            data_conf['dataset'])
    model_conf['work_dir'] = os.path.join(model_conf['work_dir'],
                                          time.strftime('%Y%m%d-%H%M%S'))
    # `logging` is a logging *function* returned by create_exp_dir,
    # shadowing any stdlib `logging` module in this scope.
    logging = create_exp_dir(model_conf['work_dir'],
                             scripts_to_save=None,
                             debug=model_conf['debug'])

    # Set the random seed manually for reproducibility.
    #np.random.seed(model_conf['seed'])
    #torch.manual_seed(model_conf['seed'])
    if torch.cuda.is_available():
        if not model_conf['cuda']:
            print(
                'WARNING: You have a CUDA device, so you should probably run with --cuda'
            )
        else:
            pass
            #torch.cuda.manual_seed_all(model_conf['seed'])

    # Validate `--fp16` option: fp16 needs CUDA and NVIDIA apex.
    if model_conf['fp16']:
        if not model_conf['cuda']:
            print('WARNING: --fp16 requires --cuda, ignoring --fp16 option')
            model_conf['fp16'] = False
        else:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except:  # NOTE(review): bare except also hides non-import failures
                print('WARNING: apex not installed, ignoring --fp16 option')
                model_conf['fp16'] = False

    device = torch.device('cuda' if model_conf['cuda'] else 'cpu')

    ###############################################################################
    # Load data
    ###############################################################################
    corpus = get_corpus(data_conf['dataset'], data_conf['data_dir'],
                        pitch_classes, time_steps_vocab, conf['processing'])

    ntokens = corpus.vocab_size
    model_conf['n_token'] = ntokens

    # No adaptive-softmax clusters; a single untied projection.
    cutoffs, tie_projs = [], [False]

    eval_batch_size = 10
    tr_iter = corpus.get_iterator('train', model_conf['train_batch_size'],
                                  model_conf['tgt_len'], device=device,
                                  ext_len=model_conf['ext_len'])
    va_iter = corpus.get_iterator('valid', eval_batch_size,
                                  model_conf['tgt_len'], device=device,
                                  ext_len=model_conf['ext_len'])
    te_iter = corpus.get_iterator('test', eval_batch_size,
                                  model_conf['tgt_len'], device=device,
                                  ext_len=model_conf['ext_len'])

    ###############################################################################
    # Build the model
    ###############################################################################
    def init_weight(weight):
        # Initialise one weight tensor per the configured scheme.
        if model_conf['init'] == 'uniform':
            nn.init.uniform_(weight, -model_conf['init_range'],
                             model_conf['init_range'])
        elif model_conf['init'] == 'normal':
            nn.init.normal_(weight, 0.0, model_conf['init_std'])

    def init_bias(bias):
        # Biases always start at zero.
        nn.init.constant_(bias, 0.0)

    def weights_init(m):
        # Dispatch for model.apply(): initialise by module class name.
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            if hasattr(m, 'weight') and m.weight is not None:
                init_weight(m.weight)
            if hasattr(m, 'bias') and m.bias is not None:
                init_bias(m.bias)
        elif classname.find('AdaptiveEmbedding') != -1:
            if hasattr(m, 'emb_projs'):
                for i in range(len(m.emb_projs)):
                    if m.emb_projs[i] is not None:
                        nn.init.normal_(m.emb_projs[i], 0.0,
                                        model_conf['proj_init_std'])
        elif classname.find('Embedding') != -1:
            if hasattr(m, 'weight'):
                init_weight(m.weight)
        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
                init_weight(m.cluster_weight)
            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
                init_bias(m.cluster_bias)
            if hasattr(m, 'out_projs'):
                for i in range(len(m.out_projs)):
                    if m.out_projs[i] is not None:
                        nn.init.normal_(m.out_projs[i], 0.0,
                                        model_conf['proj_init_std'])
        elif classname.find('LayerNorm') != -1:
            if hasattr(m, 'weight'):
                nn.init.normal_(m.weight, 1.0, model_conf['init_std'])
            if hasattr(m, 'bias') and m.bias is not None:
                init_bias(m.bias)
        elif classname.find('TransformerLM') != -1:
            # Relative-position parameters of the top-level model.
            if hasattr(m, 'r_emb'):
                init_weight(m.r_emb)
            if hasattr(m, 'r_w_bias'):
                init_weight(m.r_w_bias)
            if hasattr(m, 'r_r_bias'):
                init_weight(m.r_r_bias)
            if hasattr(m, 'r_bias'):
                init_bias(m.r_bias)

    def update_dropout(m):
        # Override dropout probability of a restarted model from the config.
        classname = m.__class__.__name__
        if classname.find('Dropout') != -1:
            if hasattr(m, 'p'):
                m.p = model_conf['dropout']

    def update_dropatt(m):
        # Same for attention dropout.
        if hasattr(m, 'dropatt'):
            m.dropatt.p = model_conf['dropatt']

    if model_conf['restart']:
        # Warm restart: reload a previously saved full model object.
        with open(os.path.join(model_conf['restart_dir'], 'model.pt'),
                  'rb') as f:
            model = torch.load(f)
        if not model_conf['fp16']:
            model = model.float()
        model.apply(update_dropout)
        model.apply(update_dropatt)
    else:
        # NOTE(review): `tie_weight` is fed from the 'not_tied' flag —
        # confirm the intended polarity against the CLI/config definition.
        model = MemTransformerLM(ntokens,
                                 model_conf['n_layer'],
                                 model_conf['n_head'],
                                 model_conf['d_model'],
                                 model_conf['d_head'],
                                 model_conf['d_inner'],
                                 model_conf['dropout'],
                                 model_conf['dropatt'],
                                 tie_weight=model_conf['not_tied'],
                                 d_embed=model_conf['d_embed'],
                                 div_val=model_conf['div_val'],
                                 tie_projs=tie_projs,
                                 pre_lnorm=model_conf['pre_lnorm'],
                                 tgt_len=model_conf['tgt_len'],
                                 ext_len=model_conf['ext_len'],
                                 mem_len=model_conf['mem_len'],
                                 cutoffs=cutoffs,
                                 same_length=model_conf['same_length'],
                                 attn_type=model_conf['attn_type'],
                                 clamp_len=model_conf['clamp_len'],
                                 sample_softmax=model_conf['sample_softmax'])
        model.apply(weights_init)
        model.word_emb.apply(
            weights_init
        )  # ensure embedding init is not overridden by out_layer in case of weight sharing
    model_conf['n_all_param'] = sum([p.nelement() for p in model.parameters()])
    model_conf['n_nonemb_param'] = sum(
        [p.nelement() for p in model.layers.parameters()])

    if model_conf['fp16']:
        model = model.half()

    if model_conf['multi_gpu']:
        model = model.to(device)
        if model_conf['gpu0_bsz'] >= 0:
            # First GPU gets a smaller (per-chunk) share of the batch.
            para_model = BalancedDataParallel(model_conf['gpu0_bsz'] //
                                              model_conf['batch_chunk'],
                                              model,
                                              dim=1).to(device)
        else:
            para_model = nn.DataParallel(model, dim=1).to(device)
    else:
        para_model = model.to(device)

    #### optimizer
    # With sample_softmax the (large) embedding gets a sparse optimizer;
    # all other parameters use the dense one.
    if model_conf['optim'].lower() == 'sgd':
        if model_conf['sample_softmax'] > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SGD(sparse_params,
                                         lr=model_conf['learning_rate'] * 2)
            optimizer = optim.SGD(dense_params,
                                  lr=model_conf['learning_rate'],
                                  momentum=model_conf['mom'])
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=model_conf['learning_rate'],
                                  momentum=model_conf['mom'])
    elif model_conf['optim'].lower() == 'adam':
        if model_conf['sample_softmax'] > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SparseAdam(
                sparse_params, lr=model_conf['learning_rate'])
            optimizer = optim.Adam(dense_params,
                                   lr=model_conf['learning_rate'])
        else:
            optimizer = optim.Adam(model.parameters(),
                                   lr=model_conf['learning_rate'])
    elif model_conf['optim'].lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=model_conf['learning_rate'])

    #### scheduler
    if model_conf['scheduler'] == 'cosine':
        # here we do not set eta_min to lr_min to be backward compatible
        # because in previous versions eta_min is default to 0
        # rather than the default value of lr_min 1e-6
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, model_conf['max_step'],
            eta_min=model_conf['eta_min'])  # should use eta_min arg
        if model_conf['sample_softmax'] > 0:
            scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(
                optimizer_sparse,
                model_conf['max_step'],
                eta_min=model_conf['eta_min'])  # should use eta_min arg
    elif model_conf['scheduler'] == 'inv_sqrt':
        # originally used for Transformer (in Attention is all you need)
        def lr_lambda(step):
            # return a multiplier instead of a learning rate
            if step == 0 and model_conf['warmup_steps'] == 0:
                return 1.
            else:
                return 1. / (step ** 0.5) if step > model_conf['warmup_steps'] \
                    else step / (model_conf['warmup_steps'] ** 1.5)

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lr_lambda)
    elif model_conf['scheduler'] == 'dev_perf':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=model_conf['decay_rate'],
            patience=model_conf['patience'],
            min_lr=model_conf['lr_min'])
        if model_conf['sample_softmax'] > 0:
            scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer_sparse,
                factor=model_conf['decay_rate'],
                patience=model_conf['patience'],
                min_lr=model_conf['lr_min'])
    elif model_conf['scheduler'] == 'constant':
        pass

    if model_conf['cuda'] and model_conf['fp16']:
        # If model_conf['dynamic_loss_scale'] is False, static_loss_scale will be used.
        # If model_conf['dynamic_loss_scale'] is True, it will take precedence over static_loss_scale.
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=model_conf['static_loss_scale'],
            dynamic_loss_scale=model_conf['dynamic_loss_scale'],
            dynamic_loss_args={'init_scale': 2**16})

    if model_conf['restart']:
        if os.path.exists(
                os.path.join(model_conf['restart_dir'], 'optimizer.pt')):
            with open(os.path.join(model_conf['restart_dir'], 'optimizer.pt'),
                      'rb') as f:
                opt_state_dict = torch.load(f)
                optimizer.load_state_dict(opt_state_dict)
        else:
            print('Optimizer was not saved. Start from scratch.')

    logging('=' * 100)
    for k, v in model_conf.items():
        logging('    - {} : {}'.format(k, v))
    logging('=' * 100)
    logging('#params = {}'.format(model_conf['n_all_param']))
    logging('#non emb params = {}'.format(model_conf['n_nonemb_param']))

    ###############################################################################
    # Training code
    ###############################################################################
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()

        # If the model does not use memory at all, make the ext_len longer.
        # Otherwise, make the mem_len longer and keep the ext_len the same.
        if model_conf['mem_len'] == 0:
            model.reset_length(
                model_conf['eval_tgt_len'], model_conf['ext_len'] +
                model_conf['tgt_len'] - model_conf['eval_tgt_len'],
                model_conf['mem_len'])
        else:
            model.reset_length(
                model_conf['eval_tgt_len'], model_conf['ext_len'],
                model_conf['mem_len'] + model_conf['tgt_len'] -
                model_conf['eval_tgt_len'])

        # Evaluation
        total_len, total_loss = 0, 0.
        with torch.no_grad():
            mems = tuple()
            for i, (data, target, seq_len) in enumerate(eval_iter):
                if model_conf['max_eval_steps'] > 0 and i >= model_conf[
                        'max_eval_steps']:
                    break
                ret = model(data, target, *mems)
                loss, mems = ret[0], ret[1:]
                loss = loss.mean()
                # seq_len-weighted average of the token loss.
                total_loss += seq_len * loss.float().item()
                total_len += seq_len

        # Switch back to the training mode
        model.reset_length(model_conf['tgt_len'], model_conf['ext_len'],
                           model_conf['mem_len'])
        model.train()

        return total_loss / total_len

    def train():
        # Turn on training mode which enables dropout.
        # FIX(review): these counters are locals of model_main, not module
        # globals; `nonlocal` is required so the epoch loop below sees the
        # updated train_step (with `global` the max_step check never fired).
        nonlocal train_step, train_loss, best_val_loss, eval_start_time, log_start_time
        model.train()
        if model_conf['batch_chunk'] > 1:
            mems = [tuple() for _ in range(model_conf['batch_chunk'])]
        else:
            mems = tuple()
        train_iter = tr_iter.get_varlen_iter(
        ) if model_conf['varlen'] else tr_iter
        for batch, (data, target, seq_len) in enumerate(train_iter):
            model.zero_grad()
            if model_conf['batch_chunk'] > 1:
                # Gradient accumulation over micro-batches along dim 1.
                data_chunks = torch.chunk(data, model_conf['batch_chunk'], 1)
                target_chunks = torch.chunk(target,
                                            model_conf['batch_chunk'], 1)
                for i in range(model_conf['batch_chunk']):
                    data_i = data_chunks[i].contiguous()
                    target_i = target_chunks[i].contiguous()
                    ret = para_model(data_i, target_i, *mems[i])
                    loss, mems[i] = ret[0], ret[1:]
                    loss = loss.float().mean().type_as(
                        loss) / model_conf['batch_chunk']
                    if model_conf['fp16']:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                    train_loss += loss.float().item()
            else:
                ret = para_model(data, target, *mems)
                loss, mems = ret[0], ret[1:]
                loss = loss.float().mean().type_as(loss)
                if model_conf['fp16']:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                train_loss += loss.float().item()

            if model_conf['fp16']:
                optimizer.clip_master_grads(model_conf['clip'])
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               model_conf['clip'])

            optimizer.step()
            if model_conf['sample_softmax'] > 0:
                optimizer_sparse.step()

            # step-wise learning rate annealing
            train_step += 1
            if model_conf['scheduler'] in ['cosine', 'constant', 'dev_perf']:
                # linear warmup stage
                if train_step < model_conf['warmup_steps']:
                    curr_lr = model_conf[
                        'learning_rate'] * train_step / model_conf[
                            'warmup_steps']
                    optimizer.param_groups[0]['lr'] = curr_lr
                    if model_conf['sample_softmax'] > 0:
                        # FIX(review): param-group key is 'lr'; the former
                        # 'learning_rate' key was dead and never changed the LR.
                        optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
                else:
                    if model_conf['scheduler'] == 'cosine':
                        scheduler.step(train_step)
                        if model_conf['sample_softmax'] > 0:
                            scheduler_sparse.step(train_step)
            elif model_conf['scheduler'] == 'inv_sqrt':
                scheduler.step(train_step)

            if train_step % model_conf['log_interval'] == 0:
                cur_loss = train_loss / model_conf['log_interval']
                elapsed = time.time() - log_start_time
                log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
                    '| ms/batch {:5.2f} | loss {:5.2f}'.format(
                        epoch, train_step, batch+1,
                        optimizer.param_groups[0]['lr'],
                        elapsed * 1000 / model_conf['log_interval'], cur_loss)
                log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
                logging(log_str)
                train_loss = 0
                log_start_time = time.time()

            if train_step == 1 or train_step % model_conf[
                    'eval_interval'] == 0:
                val_loss = evaluate(va_iter)
                logging('-' * 100)
                log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
                    '| valid loss {:5.2f}'.format(
                        train_step // model_conf['eval_interval'], train_step,
                        (time.time() - eval_start_time), val_loss)
                log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
                logging(log_str)
                logging('-' * 100)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    create_dir_if_not_exists(
                        os.path.join(model_conf['work_dir'],
                                     f'train_step_{train_step}', ''))
                    if not model_conf['debug']:
                        with open(
                                os.path.join(model_conf['work_dir'],
                                             f'train_step_{train_step}',
                                             'model.pt'), 'wb') as f:
                            torch.save(model, f)
                        with open(
                                os.path.join(model_conf['work_dir'],
                                             f'train_step_{train_step}',
                                             'optimizer.pt'), 'wb') as f:
                            torch.save(optimizer.state_dict(), f)
                    best_val_loss = val_loss
                # dev-performance based learning rate annealing
                if model_conf['scheduler'] == 'dev_perf':
                    scheduler.step(val_loss)
                    if model_conf['sample_softmax'] > 0:
                        scheduler_sparse.step(val_loss)

                eval_start_time = time.time()

            if train_step == model_conf['max_step']:
                break

    # Loop over epochs.
    train_step = 0
    train_loss = 0
    best_val_loss = None

    log_start_time = time.time()
    eval_start_time = time.time()

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in itertools.count(start=1):
            train()
            if train_step == model_conf['max_step']:
                logging('-' * 100)
                logging('End of training')
                break
    except KeyboardInterrupt:
        logging('-' * 100)
        logging('Exiting from training early')

    create_dir_if_not_exists(model_conf['work_dir'])

    # Load the best saved model.
    with open(os.path.join(model_conf['work_dir'], 'model.pt'), 'rb') as f:
        model = torch.load(f)
    para_model = model.to(device)

    # Run on test data.
    test_loss = evaluate(te_iter)
    logging('=' * 100)
    logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
        test_loss, math.exp(test_loss)))
    logging('=' * 100)
patience=args.patience, min_lr=args.lr_min) if args.sample_softmax > 0: scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau( optimizer_sparse, factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) elif args.scheduler == 'constant': pass if args.cuda and args.fp16: # If args.dynamic_loss_scale is False, static_loss_scale will be used. # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale. optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, dynamic_loss_args={'init_scale': 2**16}) if args.restart: if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: opt_state_dict = torch.load(f) optimizer.load_state_dict(opt_state_dict) else: print('Optimizer was not saved. Start from scratch.') logging('=' * 100) for k, v in args.__dict__.items(): logging(' - {} : {}'.format(k, v)) logging('=' * 100) logging('#params = {}'.format(args.n_all_param))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--dev_batch_size", default=8, type=int, help="Total batch size for develop") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=3000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument('--model_path', type=str, default='./model', help='save model path') parser.add_argument('--load_model', type=str, default=None) parser.add_argument('--embedding_dim', type=int, default=300) parser.add_argument('--dropout_prob', type=float, default=0.2) args = parser.parse_args() processors = {"memory": MemoryProcessor, "logic": LogicalProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device((args.local_rank)) device = "cuda" n_gpu = torch.cuda.device_count() # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') torch.backends.cudnn.benchmark = True if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". 
format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) vocab_dim = len(tokenization.load_vocab(args.vocab_file)) model = SequenceClassification(vocab_dim, args.embedding_dim, args.dropout_prob, len(label_list), device) if args.load_model is not None: model.load_state_dict(torch.load(args.load_model, map_location='cpu')) model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) global_step = 0 if args.local_rank != -1: model = DDP(model) optimizer = FP16_Optimizer(optimizer) ''' model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) ''' elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: #train feature train_features = convert_to_ids(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) 
logger.info(" Num steps = %d", num_train_steps)

# ---- Training data: tensors built from pre-computed feature id lists ----
all_q_ids = torch.tensor([f.que_ids for f in train_features], dtype=torch.long)
all_d_ids = torch.tensor([f.des_ids for f in train_features], dtype=torch.long)
all_sd_ids = torch.tensor([f.scene_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_q_ids, all_d_ids, all_sd_ids, all_label_ids)
# local_rank == -1 means single-process training; otherwise shard per worker.
if args.local_rank == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, num_workers=1, sampler=train_sampler,
                              batch_size=args.train_batch_size)

# ---- Development data (per-epoch validation) ----
dev_examples = processor.get_dev_examples(args.data_dir)
dev_features = convert_to_ids(dev_examples, label_list, args.max_seq_length,
                              tokenizer)
all_dev_q_ids = torch.tensor([f.que_ids for f in dev_features], dtype=torch.long)
all_dev_d_ids = torch.tensor([f.des_ids for f in dev_features], dtype=torch.long)
all_dev_sd_ids = torch.tensor([f.scene_ids for f in dev_features], dtype=torch.long)
all_dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)
dev_data = TensorDataset(all_dev_q_ids, all_dev_d_ids, all_dev_sd_ids,
                         all_dev_label_ids)
if args.local_rank == -1:
    dev_sampler = RandomSampler(dev_data)
else:
    dev_sampler = DistributedSampler(dev_data)
dev_dataloader = DataLoader(dev_data, num_workers=1, sampler=dev_sampler,
                            batch_size=args.eval_batch_size)

# ---- Training loop ----
model.train()
losses = []       # mean training loss per epoch
dev_losses = []   # mean dev loss per epoch
for epoch in range(int(args.num_train_epochs)):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for q_ids, d_ids, sd_ids, label_ids in train_dataloader:
        optimizer.zero_grad()
        q_ids = q_ids.to(device)
        d_ids = d_ids.to(device)
        sd_ids = sd_ids.to(device)
        label_ids = label_ids.to(device)
        # model(...) returns (loss, logits); call via __call__ so hooks run.
        loss, _ = model(q_ids, d_ids, sd_ids, label_ids)
        tr_loss += loss.item()
        nb_tr_examples += q_ids.size(0)
        nb_tr_steps += 1
        loss.backward()
        optimizer.step()
        global_step += 1

    # Checkpoint every 10 epochs; the file name depends on the task.
    if (epoch + 1) % 10 == 0:
        if args.task_name == 'memory':
            torch.save(
                model.state_dict(),
                os.path.join(
                    args.model_path,
                    'non_crossPassage_res_memory_model' + str(epoch + 1) + '.bin'))
        else:
            torch.save(
                model.state_dict(),
                os.path.join(
                    args.model_path,
                    'non_crossPassage_res_logic_model' + str(epoch + 1) + '.bin'))
    losses.append(tr_loss / nb_tr_steps)

    # ---- Per-epoch validation ----
    # BUGFIX: run in eval mode under no_grad — the original left dropout
    # active and built autograd graphs for every dev batch.  Also average
    # the dev loss over all batches; the original reported only the LAST
    # batch's loss as "validation Loss".
    model.eval()
    dev_accuracy, nb_dev_examples = 0, 0
    dev_loss_sum, nb_dev_steps = 0.0, 0
    with torch.no_grad():
        for q_ids, d_ids, sd_ids, label_ids in dev_dataloader:
            q_ids = q_ids.to(device)
            d_ids = d_ids.to(device)
            sd_ids = sd_ids.to(device)
            label_ids = label_ids.to(device)
            dev_loss, logits = model(q_ids, d_ids, sd_ids, label_ids)
            label_ids = label_ids.to('cpu').numpy()
            logits = logits.to('cpu').detach().numpy()
            dev_accuracy += accuracy(logits, label_ids)
            nb_dev_examples += q_ids.size(0)
            dev_loss_sum += dev_loss.item()
            nb_dev_steps += 1
    model.train()  # restore training mode for the next epoch

    mean_dev_loss = dev_loss_sum / nb_dev_steps
    print('-' * 20)
    print("Epochs : {}".format(epoch + 1))
    print("dev_accuracy : {}".format(dev_accuracy / nb_dev_examples))
    print("train Loss : {}".format(tr_loss / nb_tr_steps))
    print("validation Loss : {}".format(mean_dev_loss))
    dev_losses.append(mean_dev_loss)
    print('-' * 20)

# ---- Test-set evaluation ----
if args.do_eval:
    eval_examples = processor.get_test_examples(args.data_dir)
    eval_features = convert_to_ids(eval_examples, label_list,
                                   args.max_seq_length, tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    all_q_vectors = torch.tensor([f.que_ids for f in eval_features], dtype=torch.long)
    all_d_vectors = torch.tensor([f.des_ids for f in eval_features], dtype=torch.long)
    all_sd_vectors = torch.tensor([f.scene_ids for f in eval_features], dtype=torch.long)
    all_label_vectors = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_q_vectors, all_d_vectors, all_sd_vectors,
                              all_label_vectors)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, num_workers=1, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Per-batch [predicted label ids, gold label ids], for the dump below.
    logit_label_list = []
    # BUGFIX: no_grad during test evaluation (original only set eval mode).
    with torch.no_grad():
        for q_vec, d_vec, sd_vec, label_vec in tqdm(eval_dataloader,
                                                    desc="Iteration"):
            q_vec = q_vec.to(device)
            d_vec = d_vec.to(device)
            sd_vec = sd_vec.to(device)
            label_vec = label_vec.to(device)
            tmp_eval_loss, logits = model(q_vec, d_vec, sd_vec, label_vec)
            label_ids = label_vec.to('cpu').numpy()
            logits = logits.to('cpu').detach().numpy()
            eval_accuracy += accuracy(logits, label_ids)
            logit_label_list.append([np.argmax(logits, axis=1), label_ids])
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += q_vec.size(0)
            nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    result = {
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': global_step
    }

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    # Dump per-example (predicted label, gold label) pairs, tab-separated.
    with open('[memory]align_epoch20_output', 'w') as f:
        logit_output_list = []
        Gold_output_list = []
        for preds, golds in logit_label_list:
            for logit in preds:
                logit_output_list.append(convert_id_to_label(logit, label_list))
            for Gold in golds:
                Gold_output_list.append(convert_id_to_label(Gold, label_list))
        for logit, gold in zip(logit_output_list, Gold_output_list):
            f.write(str(logit) + '\t' + str(gold) + '\n')

    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))