def _train_eval(diff_vec_class, desc_vec_class, alp):
    metrics = "accuracy,recall,distance,nlg"
    suf = "{}_{}_{}".format(_get_class_name(diff_vec_class),
                            _get_class_name(desc_vec_class), alp)
    out_file = os.path.join(os.path.dirname(valid_file),
                            "nnupdater_result_{}.json".format(suf))
    slice_test_main(train_file, valid_file, out_file,
                    alpha=alp,
                    slice_count=slice_count,
                    diff_vec_class=diff_vec_class,
                    desc_vec_class=desc_vec_class)
    evaluator = Evaluator(args={
        "--metrics": metrics,
        "TEST_SET": valid_file,
        "RESULT_FILE": out_file
    })
    r, lemma_r = evaluator.evaluate()
    return suf, r, lemma_r
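# Hedged usage sketch (not part of the original project): _train_eval closes over the
# module-level train_file, valid_file and slice_count, so a small grid search over
# vectorizer classes and alpha values could be driven as below. The call signature and
# return values (suf, r, lemma_r) are taken from the definition above; the _grid_search
# name is an illustrative addition.
def _grid_search(diff_vec_classes, desc_vec_classes, alphas):
    results = {}
    for diff_cls in diff_vec_classes:
        for desc_cls in desc_vec_classes:
            for alpha in alphas:
                suf, r, lemma_r = _train_eval(diff_cls, desc_cls, alpha)
                results[suf] = (r, lemma_r)
    return results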
def main():
    args = parse_args()
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    if args.predict_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --predict_file "
            "or remove the --do_eval argument.")
    if args.output_dir and os.path.exists(args.output_dir) and \
            os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overwrite it."
            .format(args.output_dir))
    if args.overwrite_output_dir and os.path.isdir(args.output_dir):
        shutil.rmtree(args.output_dir)
    os.mkdir(args.output_dir)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    with open(os.path.join(args.output_dir, 'args.txt'), 'w') as f:
        f.write(str(args))
    for key, val in vars(args).items():
        logger.info(f"{key} - {val}")
    try:
        write_meta_data(args.output_dir, args)
    except git.exc.InvalidGitRepositoryError:
        logger.info("didn't save metadata - no git repo!")

    logger.info(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, amp training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.amp)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Barrier to make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
    # config.hidden_dropout_prob = args.encoder_dropout_prob

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. "
            "This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    config_class = LongformerConfig
    base_model_prefix = "longformer"
    S2E.config_class = config_class
    S2E.base_model_prefix = base_model_prefix
    model = S2E.from_pretrained(args.model_name_or_path,
                                config=config,
                                cache_dir=args.cache_dir,
                                args=args)
    model.to(args.device)

    if args.local_rank == 0:
        # End of barrier to make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    evaluator = Evaluator(args, tokenizer)

    # Training
    if args.do_train:
        train_dataset = get_dataset(args, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, evaluator)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer,
    # you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        result = evaluator.evaluate(model, prefix="final_evaluation", official=True)
        results.update(result)
        return results
    return results
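# Hedged sketch (an assumption, not the project's actual parser): a minimal parse_args()
# covering only the attributes that main() above reads directly. The real script almost
# certainly defines more options, different defaults, and additional training arguments.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", default="longformer")
    parser.add_argument("--model_name_or_path", default=None)
    parser.add_argument("--config_name", default=None)
    parser.add_argument("--tokenizer_name", default=None)
    parser.add_argument("--cache_dir", default=None)
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--predict_file", default=None)
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_eval", action="store_true")
    parser.add_argument("--overwrite_output_dir", action="store_true")
    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--amp", action="store_true")
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=42)
    return parser.parse_args()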
def test(args):
    """Run model testing."""
    test_args = args.test_args
    logger_args = args.logger_args

    # Load the model at ckpt_path.
    ckpt_path = test_args.ckpt_path
    ckpt_save_dir = Path(ckpt_path).parent

    # Get model args from checkpoint and add them to
    # command-line specified model args.
    model_args, data_args, optim_args, logger_args \
        = ModelSaver.get_args(cl_logger_args=logger_args,
                              ckpt_save_dir=ckpt_save_dir)
    model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path,
                                             gpu_ids=args.gpu_ids,
                                             model_args=model_args,
                                             is_training=False)
    # Get logger.
    logger = Logger(logger_args=logger_args,
                    data_args=data_args,
                    optim_args=optim_args,
                    test_args=test_args)

    # Instantiate the Predictor class for obtaining model predictions.
    predictor = Predictor(model=model, device=args.device)

    phase = test_args.phase
    is_test = False
    if phase == 'test':
        is_test = True
        phase = 'valid'  # Run valid first to get threshold

    print(f"======================{phase}=======================")

    # Get phase loader object.
    loader = get_loader(phase=phase,
                        data_args=data_args,
                        is_training=False,
                        logger=logger)
    # Obtain model predictions.
    predictions, groundtruth = predictor.predict(loader)
    # Instantiate the evaluator class for evaluating models.
    evaluator = Evaluator(logger=logger, tune_threshold=True)
    # Get model metrics and curves on the phase dataset.
    metrics = evaluator.evaluate(groundtruth, predictions)
    # Log metrics to stdout and file.
    logger.log_stdout(f"Writing metrics to {logger.metrics_path}.")
    logger.log_metrics(metrics, phase=phase)

    # Evaluate dense to get back thresholds
    dense_loader = get_loader(phase=phase,
                              data_args=data_args,
                              is_training=False,
                              logger=logger)
    dense_predictions, dense_groundtruth = predictor.predict(dense_loader)
    dense_metrics = evaluator.dense_evaluate(dense_groundtruth,
                                             dense_predictions)
    # Log metrics to stdout and file.
    logger.log_stdout(f"Writing metrics to {logger.metrics_path}.")
    logger.log_metrics(dense_metrics, phase=phase)

    if is_test:
        phase = 'test'
        threshold = metrics['threshold']
        print(f"======================{phase}=======================")
        # Get phase loader object.
        loader = get_loader(phase=phase,
                            data_args=data_args,
                            is_training=False,
                            test_args=test_args,
                            logger=logger)
        # Obtain model predictions.
        predictions, groundtruth = predictor.predict(loader)
        # Instantiate the evaluator class for evaluating models.
        evaluator = Evaluator(logger=logger,
                              threshold=threshold,
                              tune_threshold=False)
        # Get model metrics and curves on the phase dataset.
        metrics = evaluator.evaluate(groundtruth, predictions)
        # Log metrics to stdout and file.
        logger.log_stdout(f"Writing metrics to {logger.metrics_path}.")
        logger.log_metrics(metrics, phase=phase)

        # Dense test
        phase = 'dense_test'
        dense_loader = get_loader(phase=phase,
                                  data_args=data_args,
                                  is_training=False,
                                  test_args=test_args,
                                  logger=logger)
        threshold_dense = dense_metrics["threshold_dense"]
        threshold_tunef1_dense = dense_metrics["threshold_tunef1_dense"]
        dense_predictions, dense_groundtruth = predictor.predict(dense_loader)
        dense_metrics = evaluator.dense_evaluate(
            dense_groundtruth,
            dense_predictions,
            threshold=threshold_dense,
            threshold_tunef1=threshold_tunef1_dense)
        logger.log_stdout(f"Writing metrics to {logger.metrics_path}.")
        logger.log_metrics(dense_metrics, phase=phase)
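# Hedged illustration (not from the repo): the valid -> test handoff above tunes a decision
# threshold on the validation split and hands it to a test-time Evaluator via the
# threshold= argument. A minimal, self-contained sketch of applying such a tuned
# threshold to predicted probabilities; apply_threshold is an illustrative helper name.
def apply_threshold(probabilities, threshold):
    """Binarize predicted probabilities using a validation-tuned decision threshold."""
    return [1 if p >= threshold else 0 for p in probabilities]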
def main(args):
    # 1. Initial setup
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # Seed all GPUs as well
    cudnn.benchmark = True  # Enabling cudnn benchmarking at startup gives a small speed-up at no extra cost
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # 2. Logging
    if args.evaluate == 1:
        sys.stdout = Logger(osp.join(args.logs_dir, 'log_test.txt'))
    else:
        sys.stdout = Logger(osp.join(args.logs_dir, 'log_train.txt'))

    # 3. Dataset
    dataset, numclasses, train_loader, query_loader, gallery_loader = getdata(
        args.dataset, args.split, args.batch_size, args.seq_len,
        args.seq_srd, args.workers)

    # 4. Build the network
    cnn_rnn_model = models.creat(args.a1, 16, 32, 32, numclasses,
                                 num_features=args.features,
                                 seq_len=args.seq_len,
                                 batch=args.batch_size).to(device)
    cnn_rnn_model.initialize_weights()
    # param_groups = [
    #     {'params': cnn_rnn_model.parameters(), 'lr_mult': 1},]

    criterion = Criterion(args.hingeMargin).to(device)
    optimizer = optim.SGD(cnn_rnn_model.parameters(),
                          lr=args.lr1,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    # def adjust_lr(epoch):
    #     if epoch <= 20:
    #         lr = 0.01
    #     elif epoch > 20 & epoch <= 200:
    #         lr = 0.001
    #     else:
    #         lr = 0.0001
    #     for g in optimizer.param_groups:
    #         g['lr'] = lr * g.get('lr_mult', 1)

    # 5. Instantiate the trainer
    trainer = SEQTrainer(cnn_rnn_model, criterion)

    # 6. Instantiate the evaluator
    evaluator = Evaluator(cnn_rnn_model)
    best_top1 = 0

    # 7. Enter training / evaluation mode
    if args.evaluate == 1:
        checkpoint = load_checkpoint(
            osp.join(args.logs_dir, 'model_best.pth.tar'))
        cnn_rnn_model.load_state_dict(checkpoint['state_dict'])
        rank1 = evaluator.evaluate(query_loader, gallery_loader,
                                   dataset.queryinfo, dataset.galleryinfo)
    else:
        cnn_rnn_model.train()
        for epoch in range(args.start_epoch, args.epochs):
            # adjust_lr(epoch)
            trainer.train(epoch, train_loader, optimizer)

            # Evaluate every few epochs
            if (epoch + 1) % 20 == 0 or (epoch + 1) == args.epochs:
                top1 = evaluator.evaluate(query_loader, gallery_loader,
                                          dataset.queryinfo, dataset.galleryinfo)
                is_best = top1 > best_top1
                if is_best:
                    best_top1 = top1
                save_checkpoint(
                    {
                        'state_dict': cnn_rnn_model.state_dict(),
                        'epoch': epoch + 1,
                        'best_top1': best_top1,
                    },
                    is_best,
                    fpath=osp.join(args.logs_dir, 'cnn_checkpoint.pth.tar'))
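# Hedged sketch: the commented-out adjust_lr above appears to intend a step schedule
# (0.01 up to epoch 20, 0.001 up to epoch 200, 0.0001 after), but `epoch > 20 & epoch <= 200`
# parses as `epoch > (20 & epoch) <= 200` because `&` is bitwise and binds tighter than
# comparisons. A corrected version, assuming the same SGD optimizer object, would be:
def adjust_lr(optimizer, epoch):
    if epoch <= 20:
        lr = 0.01
    elif epoch <= 200:
        lr = 0.001
    else:
        lr = 0.0001
    for g in optimizer.param_groups:
        g['lr'] = lr * g.get('lr_mult', 1)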
def train(args):
    """Run model training."""

    # Get nested namespaces.
    model_args = args.model_args
    logger_args = args.logger_args
    optim_args = args.optim_args
    data_args = args.data_args

    # Get logger.
    logger = Logger(logger_args)

    if model_args.ckpt_path:
        # CL-specified args are used to load the model, rather than the
        # ones saved to args.json.
        model_args.pretrained = False
        ckpt_path = model_args.ckpt_path
        assert False  # NOTE: this assert makes the checkpoint-resume branch unreachable as written.
        model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path,
                                                 gpu_ids=args.gpu_ids,
                                                 model_args=model_args,
                                                 is_training=True)
        optim_args.start_epoch = ckpt_info['epoch'] + 1
    else:
        # If no ckpt_path is provided, instantiate a new randomly
        # initialized model.
        model_fn = models.__dict__[model_args.model]
        model = model_fn(model_args)
        model = nn.DataParallel(model, args.gpu_ids)

    # Put model on gpu or cpu and put into training mode.
    model = model.to(args.device)
    model.train()

    # Get train and valid loader objects.
    train_loader = get_loader(phase="train",
                              data_args=data_args,
                              is_training=True,
                              logger=logger)
    valid_loader = get_loader(phase="valid",
                              data_args=data_args,
                              is_training=False,
                              logger=logger)
    dense_valid_loader = get_loader(phase="dense_valid",
                                    data_args=data_args,
                                    is_training=False,
                                    logger=logger)

    # Instantiate the predictor class for obtaining model predictions.
    predictor = Predictor(model, args.device)

    # Instantiate the evaluator class for evaluating models.
    # By default, get best performance on validation set.
    evaluator = Evaluator(logger=logger, tune_threshold=True)

    # Instantiate the saver class for saving model checkpoints.
    saver = ModelSaver(save_dir=logger_args.save_dir,
                       iters_per_save=logger_args.iters_per_save,
                       max_ckpts=logger_args.max_ckpts,
                       metric_name=optim_args.metric_name,
                       maximize_metric=optim_args.maximize_metric,
                       keep_topk=True,
                       logger=logger)

    # Instantiate the optimizer class for guiding model training.
    optimizer = Optimizer(parameters=model.parameters(),
                          optim_args=optim_args,
                          batch_size=data_args.batch_size,
                          iters_per_print=logger_args.iters_per_print,
                          iters_per_visual=logger_args.iters_per_visual,
                          iters_per_eval=logger_args.iters_per_eval,
                          dataset_len=len(train_loader.dataset),
                          logger=logger)
    if model_args.ckpt_path:
        # Load the same optimizer as used in the original training.
        optimizer.load_optimizer(ckpt_path=model_args.ckpt_path,
                                 gpu_ids=args.gpu_ids)

    loss_fn = evaluator.get_loss_fn(loss_fn_name=optim_args.loss_fn)

    # Run training
    while not optimizer.is_finished_training():
        optimizer.start_epoch()

        for inputs, targets in train_loader:
            optimizer.start_iter()

            if optimizer.global_step % optimizer.iters_per_eval == 0:
                # Only evaluate every iters_per_eval examples.
                predictions, groundtruth = predictor.predict(valid_loader)
                metrics = evaluator.evaluate(groundtruth, predictions)

                # Evaluate on dense dataset
                dense_predictions, dense_groundtruth = predictor.predict(
                    dense_valid_loader)
                dense_metrics = evaluator.dense_evaluate(
                    dense_groundtruth, dense_predictions)

                # Merge the metrics dicts together
                metrics = {**metrics, **dense_metrics}

                # Log metrics to stdout.
                logger.log_metrics(metrics, phase='valid')

                # Log to tb
                logger.log_scalars(metrics, optimizer.global_step, phase='valid')

                if optimizer.global_step % logger_args.iters_per_save == 0:
                    # Only save every iters_per_save examples directly
                    # after evaluation.
                    saver.save(iteration=optimizer.global_step,
                               epoch=optimizer.epoch,
                               model=model,
                               optimizer=optimizer,
                               device=args.device,
                               metric_val=metrics[optim_args.metric_name])

                # Step learning rate scheduler.
                optimizer.step_scheduler(metrics[optim_args.metric_name])

            with torch.set_grad_enabled(True):
                # Run the minibatch through the model.
                logits = model(inputs.to(args.device))

                # Compute the minibatch loss.
                loss = loss_fn(logits, targets.to(args.device))

                # Log the data from this iteration.
                optimizer.log_iter(inputs, logits, targets, loss)

                # Perform a backward pass.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            optimizer.end_iter()

        optimizer.end_epoch(metrics)

    # Save the most recent model.
    saver.save(iteration=optimizer.global_step,
               epoch=optimizer.epoch,
               model=model,
               optimizer=optimizer,
               device=args.device,
               metric_val=metrics[optim_args.metric_name])