def setup_training(args):
    assert torch.cuda.is_available()

    if args.local_rank == -1:
        device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='smddp', init_method='env://')
        args.n_gpu = 1
        if args.gradient_accumulation_steps == 1:
            args.allreduce_post_accumulation = False
            args.allreduce_post_accumulation_fp16 = False

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
                args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train` must be True.")

    if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
            os.listdir(args.output_dir)
            and any(i.startswith('ckpt') for i in os.listdir(args.output_dir))):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def main():
    args = parse_args()

    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users,
                  nb_items=args.n_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model,
                               opt_level=args.opt_level,
                               keep_batchnorm_fp32=False,
                               loss_scale='dynamic')
    model.eval()

    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        torch.cuda.synchronize()
        start = time.time()
        predictions = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    dllogger.log(data={
        'batch_size': args.batch_size,
        'best_inference_throughput': args.batch_size / min(latencies),
        'best_inference_latency': min(latencies),
        'mean_inference_throughput': args.batch_size / np.mean(latencies),
        'mean_inference_latency': np.mean(latencies),
        'inference_latencies': latencies
    }, step=tuple())
    dllogger.flush()

    return
def init_logging(log_path):
    json_backend = dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                              filename=log_path)
    stdout_backend = dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)

    stdout_backend._metadata['best_auc'].update({'format': '0:.5f'})
    stdout_backend._metadata['best_epoch'].update({'format': '0:.2f'})
    stdout_backend._metadata['average_train_throughput'].update({'format': ':.2e'})
    stdout_backend._metadata['average_test_throughput'].update({'format': ':.2e'})

    dllogger.init(backends=[json_backend, stdout_backend])
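# A minimal usage sketch, not taken from the original scripts: once init_logging() has
# registered the JSON and stdout backends, metrics go through the module-level dllogger
# API. The helper name `_log_example_metrics` and the metric values are illustrative
# assumptions; only dllogger.log()/dllogger.flush() are real API calls.
def _log_example_metrics(log_path):
    init_logging(log_path)
    # Per-step entries carry a step tuple; step=tuple() marks a run-level summary entry.
    dllogger.log(step=(0,), data={'best_auc': 0.80, 'average_train_throughput': 1.0e6})
    dllogger.log(step=tuple(), data={'best_epoch': 1.0})
    dllogger.flush()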
def setup_logging(args):
    logging.basicConfig(level=logging.DEBUG,
                        format='{asctime}:{levelname}: {message}',
                        style='{')

    if hvd.rank() == 0:
        dllogger.init(backends=[
            dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                   step_format=format_step),
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                os.path.join(args.workspace, args.dllogger_log)),
        ])
    else:
        dllogger.init([])
def setup_training(args):
    assert torch.cuda.is_available()

    global ort_supplement
    import ort_supplement.ort_supplement as ort_supplement
    device = ort_supplement.setup_onnxruntime_with_mpi(args)

    if is_main_process(args):
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
                args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train` must be True.")

    if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
            os.listdir(args.output_dir)
            and any(i.startswith('ckpt') for i in os.listdir(args.output_dir))):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process(args):
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def main():
    args = parse_args()

    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users,
                  nb_items=args.n_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.fp16:
        model.half()
    model.eval()

    batch_sizes = [int(s) for s in args.batch_sizes.split(',')]
    result_data = {}

    for batch_size in batch_sizes:
        print('benchmarking batch size: ', batch_size)
        users = torch.cuda.LongTensor(batch_size).random_(0, args.n_users)
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
        for _ in range(args.num_batches):
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
            latencies.append(time.time() - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
        # np.percentile expects percentages in [0, 100], not fractions.
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()

    return
def __init__(self, name, json_output=None, print_freq=20):
    self.name = name
    self.train_loss_logger = IterationAverageMeter("Training loss")
    self.train_epoch_time_logger = EpochMeter("Training 1 epoch time")
    self.val_acc_logger = EpochMeter("Validation accuracy")
    self.print_freq = print_freq

    backends = [DLLogger.StdOutBackend(DLLogger.Verbosity.DEFAULT)]
    if json_output:
        backends.append(DLLogger.JSONStreamBackend(DLLogger.Verbosity.VERBOSE, json_output))
    DLLogger.init(backends)

    self.epoch = 0
    self.train_iter = 0
    self.summary = {}
def init_logger(args, full, logger):
    if full:
        logger.setLevel(logging.INFO)
        log_path = os.path.join(args.results_dir, args.log_filename)
        os.makedirs(args.results_dir, exist_ok=True)
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
        logger.warning('command line arguments: {}'.format(json.dumps(vars(args))))
        if not os.path.exists(args.results_dir):
            os.mkdir(args.results_dir)
        with open('{}/args.json'.format(args.results_dir), 'w') as f:
            json.dump(vars(args), f, indent=4)
    else:
        logger.setLevel(logging.ERROR)
        dllogger.init(backends=[])
    dllogger.log(data=vars(args), step='PARAMETER')
def init_logging(log_path, FLAGS):
    json_backend = dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                              filename=log_path)
    stdout_backend = dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)

    stdout_backend._metadata['auc'].update({'format': '0:.5f'})
    stdout_backend._metadata['throughput'].update({'format': ':.2e'})
    stdout_backend._metadata['mean_step_time_ms'].update({'format': '0:.3f'})
    stdout_backend._metadata['mean_inference_throughput'].update({'format': ':.2e'})
    stdout_backend._metadata['mean_inference_latency'].update({'format': '0:.5f'})
    for percentile in [90, 95, 99]:
        stdout_backend._metadata[f'p{percentile}_inference_latency'].update({'format': '0:.5f'})

    dllogger.init(backends=[json_backend, stdout_backend])

    if hvd.rank() == 0:
        dllogger.log(data=FLAGS.flag_values_dict(), step='PARAMETER')
        print("Command line flags:")
        print(json.dumps(FLAGS.flag_values_dict(), indent=4))
def setup_logger(args):
    os.makedirs(args.log_dir, exist_ok=True)
    if not args.json_summary:
        log_path = os.path.join(args.log_dir, 'dllogger_rank{}.log'.format(get_rank()))
    else:
        log_path = "{}_rank{}".format(args.json_summary, get_rank())

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=1, filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=1, filename=log_path)
        ])

    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = {
        'NVIDIA_TENSORFLOW_VERSION': os.environ.get('NVIDIA_TENSORFLOW_VERSION'),
        'TENSORFLOW_VERSION': os.environ.get('TENSORFLOW_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)
if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.ERROR)

    FLAGS = parse_cmdline(model_architectures.keys())
    hvd.init()

    if hvd.rank() == 0:
        log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.results_dir, exist_ok=True)
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(FLAGS), step='PARAMETER')

    runner = Runner(
        # ========= Model HParams ========= #
        n_classes=1001,
        architecture=FLAGS.arch,
        input_format='NHWC',
        compute_format=FLAGS.data_format,
        dtype=tf.float32,
        n_channels=3,
        height=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
        width=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="/workspace/object_detection/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=os.getenv('LOCAL_RANK', 0))
    parser.add_argument("--json-summary",
                        help="Out file for DLLogger",
                        default="dllogger_inference.out",
                        type=str)
    parser.add_argument(
        "--skip-eval",
        dest="skip_eval",
        help="Do not eval the predictions",
        action="store_true",
    )
    parser.add_argument(
        "--fp16",
        help="Mixed precision training",
        action="store_true",
    )
    parser.add_argument(
        "--amp",
        help="Mixed precision training",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    save_dir = ""
    logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank())

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])
        save_dir = ""

    dllogger.log(step="PARAMETER", data={"config": cfg})
    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"env_info": collect_env_info()})

    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    # Initialize mixed-precision if necessary
    if args.fp16:
        use_mixed_precision = True
    else:
        use_mixed_precision = cfg.DTYPE == "float16"
    amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)

    output_dir = cfg.OUTPUT_DIR
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
    _ = checkpointer.load(cfg.MODEL.WEIGHT)

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder

    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    results = []
    for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names,
                                                            data_loaders_val):
        result = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
            skip_eval=args.skip_eval,
            dllogger=dllogger,
        )
        synchronize()
        results.append(result)

    if is_main_process() and not args.skip_eval:
        map_results, raw_results = results[0]
        bbox_map = map_results.results["bbox"]['AP']
        segm_map = map_results.results["segm"]['AP']
        dllogger.log(step=tuple(), data={"BBOX_mAP": bbox_map, "MASK_mAP": segm_map})
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--input_dir", default=None, type=str, required=True, help="The input data dir. Should contain .hdf5 files for the task.") parser.add_argument("--config_file", default="bert_config.json", type=str, required=False, help="The BERT model config") ckpt_group = parser.add_mutually_exclusive_group(required=True) ckpt_group.add_argument("--ckpt_dir", default=None, type=str, help="The ckpt directory, e.g. /results") ckpt_group.add_argument("--ckpt_path", default=None, type=str, help="Path to the specific checkpoint") group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--eval', dest='do_eval', action='store_true') group.add_argument('--prediction', dest='do_eval', action='store_false') ## Other parameters parser.add_argument( "--bert_model", default="bert-large-uncased", type=str, required=False, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--max_predictions_per_seq", default=80, type=int, help="The maximum total of masked tokens in input sequence") parser.add_argument("--ckpt_step", default=-1, type=int, required=False, help="The model checkpoint iteration, e.g. 1000") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for training.") parser.add_argument( "--max_steps", default=-1, type=int, help= "Total number of eval steps to perform, otherwise use full dataset") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument("--log_path", help="Out file for DLLogger", default="/workspace/dllogger_inference.out", type=str) args = parser.parse_args() if 'LOCAL_RANK' in os.environ: args.local_rank = int(os.environ['LOCAL_RANK']) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method='env://') if is_main_process(): dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step) ]) else: dllogger.init(backends=[]) n_gpu = torch.cuda.device_count() if n_gpu > 1: assert (args.local_rank != -1 ) # only use torch.distributed for multi-gpu dllogger.log( step= "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16), data={}) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Prepare model config = BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) model = BertForPreTraining(config) if args.ckpt_dir: if args.ckpt_step == -1: #retrieve latest model model_names = [ f for f in os.listdir(args.ckpt_dir) if f.endswith(".pt") ] args.ckpt_step = max([ int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names ]) dllogger.log(step="load model saved at iteration", data={"number": args.ckpt_step}) model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".pt") else: model_file = args.ckpt_path state_dict = torch.load(model_file, map_location="cpu")["model"] model.load_state_dict(state_dict, strict=False) if args.fp16: model.half( ) # all parameters and buffers are converted to half precision model.to(device) multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized( ) if multi_gpu_training: model = DDP(model) files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and 'test' in f ] files.sort() dllogger.log(step="***** Running Inference *****", data={}) dllogger.log(step=" Inference batch", data={"size": args.eval_batch_size}) model.eval() nb_instances = 0 max_steps = args.max_steps if args.max_steps > 0 else np.inf global_step = 0 total_samples = 0 begin_infer = time.time() with torch.no_grad(): if args.do_eval: final_loss = 0.0 # for data_file in files: dllogger.log(step="Opening ", data={"file": data_file}) dataset = pretraining_dataset( input_file=data_file, max_pred_length=args.max_predictions_per_seq) if not multi_gpu_training: train_sampler = RandomSampler(dataset) datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True) else: train_sampler = DistributedSampler(dataset) datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True) for step, batch in enumerate( tqdm(datasetloader, desc="Iteration")): if global_step > max_steps: break batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch #\ loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels) final_loss += loss.item() global_step += 1 total_samples += len(datasetloader) torch.cuda.empty_cache() if global_step > max_steps: break final_loss /= global_step if multi_gpu_training: final_loss = torch.tensor(final_loss, device=device) dist.all_reduce(final_loss) final_loss /= torch.distributed.get_world_size() if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)): dllogger.log(step="Inference Loss", data={"final_loss": final_loss.item()}) else: # inference # if multi_gpu_training: # torch.distributed.barrier() # start_t0 = time.time() for data_file in files: dllogger.log(step="Opening ", data={"file": data_file}) dataset = pretraining_dataset( input_file=data_file, max_pred_length=args.max_predictions_per_seq) if not multi_gpu_training: train_sampler = RandomSampler(dataset) datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, 
num_workers=4, pin_memory=True) else: train_sampler = DistributedSampler(dataset) datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True) for step, batch in enumerate( tqdm(datasetloader, desc="Iteration")): if global_step > max_steps: break batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch #\ lm_logits, nsp_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=None, next_sentence_label=None) nb_instances += input_ids.size(0) global_step += 1 total_samples += len(datasetloader) torch.cuda.empty_cache() if global_step > max_steps: break # if multi_gpu_training: # torch.distributed.barrier() if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)): dllogger.log(step="Done Inferring on samples", data={}) end_infer = time.time() dllogger.log(step="Inference perf", data={ "inference_sequences_per_second": total_samples * args.eval_batch_size / (end_infer - begin_infer) })
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=os.getenv('LOCAL_RANK', 0))
    parser.add_argument("--max_steps",
                        type=int,
                        default=0,
                        help="Override number of training steps in the config")
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument("--fp16", help="Mixed precision training", action="store_true")
    parser.add_argument("--amp", help="Mixed precision training", action="store_true")
    parser.add_argument('--skip_checkpoint',
                        default=False,
                        action='store_true',
                        help="Whether to save checkpoints")
    parser.add_argument(
        "--json-summary",
        help="Out file for DLLogger",
        default="dllogger.out",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # Redundant option - Override config parameter with command line input
    if args.max_steps > 0:
        cfg.SOLVER.MAX_ITER = args.max_steps
    if args.skip_checkpoint:
        cfg.SAVE_CHECKPOINT = False
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()})
    dllogger.log(step="PARAMETER", data={"config_file": args.config_file})

    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
    dllogger.log(step="PARAMETER", data={"config": cfg})

    if args.fp16:
        fp16 = True
    else:
        fp16 = False

    model, iters_per_epoch = train(cfg, args.local_rank, args.distributed, fp16, dllogger)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args.distributed, iters_per_epoch, dllogger)
def main(args): args.fp16 = args.fp16 or args.amp if args.server_ip and args.server_port: # Distant debugging - see # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd logger.info("Waiting for debugger attach") ptvsd.enable_attach( address=(args.server_ip, args.server_port), redirect_output=True, ) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs. if not torch.distributed.is_initialized(): torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, " "16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16, )) if not args.do_train and not args.do_eval and not args.do_predict: raise ValueError("At least one of `do_train`, `do_eval` or " "`do_predict` must be True.") if is_main_process(): if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train): logger.warning("Output directory ({}) already exists and is not " "empty.".format(args.output_dir)) mkdir_by_main_process(args.output_dir) if is_main_process(): dllogger.init(backends=[ dllogger.JSONStreamBackend( verbosity=dllogger.Verbosity.VERBOSE, filename=os.path.join(args.output_dir, 'dllogger.json'), ), dllogger.StdOutBackend( verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step, ), ]) else: dllogger.init(backends=[]) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( args.gradient_accumulation_steps)) if args.gradient_accumulation_steps > args.train_batch_size: raise ValueError("gradient_accumulation_steps ({}) cannot be larger " "train_batch_size ({}) - there cannot be a fraction " "of one sample.".format( args.gradient_accumulation_steps, args.train_batch_size, )) args.train_batch_size = (args.train_batch_size // args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) dllogger.log(step="PARAMETER", data={"SEED": args.seed}) processor = PROCESSORS[args.task_name]() num_labels = len(processor.get_labels()) #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer( args.vocab_file, do_lower_case=args.do_lower_case, max_len=512, ) # for bert large num_train_optimization_steps = None if args.do_train: train_features = get_train_features( args.data_dir, args.bert_model, args.max_seq_length, args.do_lower_case, args.local_rank, args.train_batch_size, args.gradient_accumulation_steps, args.num_train_epochs, tokenizer, processor, ) num_train_optimization_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = (num_train_optimization_steps // torch.distributed.get_world_size()) # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) # modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = 
modeling.BertForSequenceClassification( config, num_labels=num_labels, ) logger.info("USING CHECKPOINT from {}".format(args.init_checkpoint)) checkpoint = torch.load(args.init_checkpoint, map_location='cpu') checkpoint = checkpoint["model"] if "model" in checkpoint.keys( ) else checkpoint model.load_state_dict(checkpoint, strict=False) logger.info("USED CHECKPOINT from {}".format(args.init_checkpoint)) dllogger.log( step="PARAMETER", data={ "num_parameters": sum([p.numel() for p in model.parameters() if p.requires_grad]), }, ) model.to(device) # Prepare optimizer model, optimizer, scheduler = init_optimizer_and_amp( model, args.learning_rate, args.loss_scale, args.warmup_proportion, num_train_optimization_steps, args.fp16, ) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex to use " "distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) loss_fct = torch.nn.CrossEntropyLoss() results = {} if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = gen_tensor_dataset(train_features) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size, ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 latency_train = 0.0 nb_tr_examples = 0 model.train() tic_train = time.perf_counter() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask) loss = loss_fct( logits.view(-1, num_labels), label_ids.view(-1), ) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up for BERT # which FusedAdam doesn't do scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 latency_train = time.perf_counter() - tic_train tr_loss = tr_loss / nb_tr_steps results.update({ 'global_step': global_step, 'train:loss': tr_loss, 'train:latency': latency_train, 'train:num_samples_per_gpu': nb_tr_examples, 'train:num_steps': nb_tr_steps, 'train:throughput': get_world_size() * nb_tr_examples / latency_train, }) if is_main_process() and not args.skip_checkpoint: model_to_save = model.module if hasattr(model, 'module') else model torch.save( {"model": model_to_save.state_dict()}, os.path.join(args.output_dir, modeling.WEIGHTS_NAME), ) with open( os.path.join(args.output_dir, modeling.CONFIG_NAME), 'w', ) as f: f.write(model_to_save.config.to_json_string()) if (args.do_eval or args.do_predict) and is_main_process(): eval_examples = processor.get_dev_examples(args.data_dir) eval_features, label_map = convert_examples_to_features( eval_examples, processor.get_labels(), args.max_seq_length, tokenizer, ) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = gen_tensor_dataset(eval_features) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, ) model.eval() preds = None out_label_ids = None eval_loss = 0 nb_eval_steps, nb_eval_examples = 0, 0 cuda_events = [(torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) for _ in range(len(eval_dataloader))] for i, (input_ids, input_mask, segment_ids, label_ids) in tqdm( enumerate(eval_dataloader), desc="Evaluating", ): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): cuda_events[i][0].record() logits = model(input_ids, segment_ids, input_mask) cuda_events[i][1].record() if args.do_eval: eval_loss += loss_fct( logits.view(-1, num_labels), label_ids.view(-1), ).mean().item() nb_eval_steps += 1 nb_eval_examples += input_ids.size(0) if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = label_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, label_ids.detach().cpu().numpy(), axis=0, ) torch.cuda.synchronize() eval_latencies = [ event_start.elapsed_time(event_end) for event_start, event_end in cuda_events ] eval_latencies = list(sorted(eval_latencies)) def infer_latency_sli(threshold): index = int(len(eval_latencies) * threshold) - 1 index = min(max(index, 0), len(eval_latencies) - 1) return eval_latencies[index] eval_throughput = (args.eval_batch_size / (np.mean(eval_latencies) / 1000)) results.update({ 'eval:num_samples_per_gpu': nb_eval_examples, 'eval:num_steps': nb_eval_steps, 'infer:latency(ms):50%': infer_latency_sli(0.5), 'infer:latency(ms):90%': infer_latency_sli(0.9), 'infer:latency(ms):95%': infer_latency_sli(0.95), 'infer:latency(ms):99%': infer_latency_sli(0.99), 
'infer:latency(ms):100%': infer_latency_sli(1.0), 'infer:latency(ms):avg': np.mean(eval_latencies), 'infer:latency(ms):std': np.std(eval_latencies), 'infer:latency(ms):sum': np.sum(eval_latencies), 'infer:throughput(samples/s):avg': eval_throughput, }) preds = np.argmax(preds, axis=1) if args.do_predict: dump_predictions( os.path.join(args.output_dir, 'predictions.json'), label_map, preds, eval_examples, ) if args.do_eval: results['eval:loss'] = eval_loss / nb_eval_steps eval_result = compute_metrics(args.task_name, preds, out_label_ids) results.update(eval_result) if is_main_process(): logger.info("***** Results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) with open(os.path.join(args.output_dir, "results.txt"), "w") as writer: json.dump(results, writer) dllogger_queries_from_results = { 'exact_match': 'acc', 'F1': 'f1', 'e2e_train_time': 'train:latency', 'training_sequences_per_second': 'train:throughput', 'e2e_inference_time': ('infer:latency(ms):sum', lambda x: x / 1000), 'inference_sequences_per_second': 'infer:throughput(samples/s):avg', } for key, query in dllogger_queries_from_results.items(): results_key, convert = (query if isinstance(query, tuple) else (query, lambda x: x)) if results_key not in results: continue dllogger.log( step=tuple(), data={key: convert(results[results_key])}, ) dllogger.flush() return results
def main():
    args = parse_args()

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    batch_sizes = [int(s) for s in args.batch_sizes.split(',')]
    result_data = {}

    for batch_size in batch_sizes:
        print('Benchmarking batch size', batch_size)
        tf.reset_default_graph()

        # Input tensors
        users = tf.placeholder(tf.int32, shape=(None,))
        items = tf.placeholder(tf.int32, shape=(None,))
        dropout = tf.placeholder_with_default(0.0, shape=())

        # Model ops and saver
        logits_op = ncf_model_ops(users=users,
                                  items=items,
                                  labels=None,
                                  dup_mask=None,
                                  mode='INFERENCE',
                                  params={
                                      'fp16': False,
                                      'val_batch_size': batch_size,
                                      'num_users': args.n_users,
                                      'num_items': args.n_items,
                                      'num_factors': args.factors,
                                      'mf_reg': 0,
                                      'layer_sizes': args.layers,
                                      'layer_regs': [0. for i in args.layers],
                                      'dropout': 0.0,
                                      'sigmoid': True,
                                      'top_k': None,
                                      'learning_rate': None,
                                      'beta_1': None,
                                      'beta_2': None,
                                      'epsilon': None,
                                      'loss_scale': None,
                                  })

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        if args.xla:
            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)

        saver = tf.train.Saver()
        if args.load_checkpoint_path:
            saver.restore(sess, args.load_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        users_batch = np.random.randint(size=batch_size, low=0, high=args.n_users)
        items_batch = np.random.randint(size=batch_size, low=0, high=args.n_items)

        latencies = []
        for i in range(args.num_batches):
            start = time.time()
            _ = sess.run(logits_op,
                         feed_dict={
                             users: users_batch,
                             items: items_batch,
                             dropout: 0.0
                         })
            end = time.time()
            if i < 10:  # warmup iterations
                continue
            latencies.append(end - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()
def main(): args = parse_args() print("init distributed") init_distributed(args) if args.rank == 0: wandb.init() dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.log(data=vars(args), step='PARAMETER') torch.manual_seed(1) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # sync workers before timing if args.distributed: torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0) torch.cuda.synchronize() main_start_time = time.time() train_ratings = torch.load(args.data + '/train_ratings.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) test_ratings = torch.load(args.data + '/test_ratings.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) test_negs = torch.load(args.data + '/test_negatives.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) valid_negative = test_negs.shape[1] nb_maxs = torch.max(train_ratings, 0)[0] nb_users = nb_maxs[0].item() + 1 nb_items = nb_maxs[1].item() + 1 all_test_users = test_ratings.shape[0] test_users, test_items, dup_mask, real_indices = dataloading.create_test_data( test_ratings, test_negs, args) # make pytorch memory behavior more consistent later torch.cuda.empty_cache() # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mlp_layer_sizes=args.layers, dropout=args.dropout).cuda() optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) criterion = nn.BCEWithLogitsLoss(reduction='none').cuda( ) # use torch.mean() with dim later to avoid copy to host # Move model and loss to GPU if args.distributed: model = DDP(model, device_ids=[args.local_rank]) local_batch = args.batch_size // args.world_size traced_criterion = torch.jit.trace( criterion.forward, (torch.rand(local_batch, 1), torch.rand(local_batch, 1))) print(model) print("{} parameters".format(utils.count_parameters(model))) if args.load_checkpoint_path: state_dict = torch.load(args.load_checkpoint_path) state_dict = { k.replace('module.', ''): v for k, v in state_dict.items() } model.load_state_dict(state_dict) if args.mode == 'test': start = time.time() hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=valid_negative + 1, num_user=all_test_users, distributed=args.distributed) val_time = time.time() - start eval_size = all_test_users * (valid_negative + 1) eval_throughput = eval_size / val_time dllogger.log(step=tuple(), data={ 'best_eval_throughput': eval_throughput, 'hr@10': hr }) return max_hr = 0 best_epoch = 0 train_throughputs, eval_throughputs = [], [] for epoch in range(args.epochs): begin = time.time() epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data( train_ratings, nb_items, args) num_batches = len(epoch_users) for i in range(num_batches // args.grads_accumulated): for j in range(args.grads_accumulated): batch_idx = (args.grads_accumulated * i) + j user = epoch_users[batch_idx] item = epoch_items[batch_idx] label = epoch_label[batch_idx].view(-1, 1) outputs = model(user, item) loss = traced_criterion(outputs, label).float() loss = torch.mean(loss.view(-1), 0) loss.backward() optimizer.step() if args.rank == 0: wandb.log({"Test loss": loss}) for p in model.parameters(): p.grad = None del epoch_users, epoch_items, epoch_label train_time = time.time() - begin begin = time.time() 
epoch_samples = len(train_ratings) * (args.negative_samples + 1) train_throughput = epoch_samples / train_time train_throughputs.append(train_throughput) hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=valid_negative + 1, num_user=all_test_users, epoch=epoch, distributed=args.distributed) val_time = time.time() - begin eval_size = all_test_users * (valid_negative + 1) eval_throughput = eval_size / val_time eval_throughputs.append(eval_throughput) dllogger.log(step=(epoch, ), data={ 'train_throughput': train_throughput, 'hr@10': hr, 'train_epoch_time': train_time, 'validation_epoch_time': val_time, 'eval_throughput': eval_throughput }) if args.rank == 0: wandb.log({"Test hit rate": hr}) wandb.log({"Test train epoch time": train_time}) wandb.log({"Test train throughput": train_throughput}) if hr > max_hr and args.local_rank == 0: max_hr = hr best_epoch = epoch # save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth') print("New best hr!") # torch.save(model.state_dict(), save_checkpoint_path) best_model_timestamp = time.time() if args.threshold is not None: if hr >= args.threshold: print("Hit threshold of {}".format(args.threshold)) break if args.local_rank == 0: dllogger.log(data={ 'best_train_throughput': max(train_throughputs), 'best_eval_throughput': max(eval_throughputs), 'mean_train_throughput': np.mean(train_throughputs), 'mean_eval_throughput': np.mean(eval_throughputs), 'best_accuracy': max_hr, 'best_epoch': best_epoch, 'time_to_target': time.time() - main_start_time, 'time_to_best_model': best_model_timestamp - main_start_time }, step=tuple())
            glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)
    trainer.test()
    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args()

    if get_rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    main(args)
    dllogger.flush()
def init_dllogger(log_dir):
    Logger.init([
        Logger.StdOutBackend(Logger.Verbosity.DEFAULT, step_format=format_step),
        Logger.JSONStreamBackend(Logger.Verbosity.VERBOSE, log_dir)
    ])
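# Hedged usage sketch, not part of the original script. It assumes `Logger` is the
# dllogger module imported under an alias (e.g. `import dllogger as Logger`); the helper
# name `_example_run` and the metric names are illustrative only.
def _example_run(log_dir):
    init_dllogger(log_dir)
    for step in range(3):
        Logger.log(step=(step,), data={'loss': 1.0 / (step + 1)})
    # An empty step tuple is the conventional marker for a final summary record.
    Logger.log(step=tuple(), data={'final_loss': 1.0 / 3})
    Logger.flush()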
def run_generate(verbose=True): """ Takes input text, generates output, and then using reference calculates the BLEU scores. The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed. Args: verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout Returns: a tuple: ``(scores, params}`` - ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}`` - ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}`` """ parser = argparse.ArgumentParser() parser.add_argument("model_path", type=str, help="like facebook/bart-large-cnn or path to ckpt") parser.add_argument("config_path", type=str, help="path to config") parser.add_argument("data_dir", type=str, help="like cnn_dm/test.source") parser.add_argument("save_path", type=str, help="where to save summaries") parser.add_argument("--type_path", type=str, required=False, default="test", help="like cnn_dm/test.target") parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.") parser.add_argument("--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples") parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics") parser.add_argument("--bs", type=int, default=8, required=False, help="batch size") parser.add_argument("--n_obs", type=int, default=None, required=False, help="How many observations. Defaults to all.") parser.add_argument("--num_return_sequences", type=int, default=1, required=False, help="How many sequences to return") parser.add_argument("--fp16", action="store_true") parser.add_argument("--dump-args", action="store_true", help="print the custom hparams with the results") parser.add_argument( "--info", nargs="?", type=str, const=datetime_now(), help= "use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.", ) parser.add_argument("--eval_max_gen_length", type=int, default=None, help="never generate more than n tokens") parser.add_argument( "--eval_beams", type=int, default=None, required=False, help="# beams to use. 0 corresponds to not using beam search.") parser.add_argument( "--max_source_length", default=1024, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--max_target_length", default=142, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--sync_timeout", type=int, default=600, required=False, help= "How long should master process wait for other processes to finish.", ) parser.add_argument("--debug", action="store_true") parser.add_argument('--json-summary', type=str, default="results/dllogger.json", help='If provided, the json summary will be written to' 'the specified file.') parser.add_argument( '--distill', type=str, default=None, help="string indicating how model is distilled, only sft supported", choices=["sft", None]) parser.add_argument( '--layers', type=str, default=None, help= "string indicating which teacher layers remain, split by '-' (ex. 
0-6-11)" ) parser.add_argument('--do_encoder', action="store_true", default=False, help="if true encoder distilled") parser.add_argument('--do_decoder', action="store_true", default=False, help="if true decoder distilled") dist = parser.add_argument_group('distributed setup') dist.add_argument('--local_rank', type=int, default=os.getenv('LOCAL_RANK', 0), help='Used for multi-process training.') start_time = time.time() # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate args, rest = parser.parse_known_args() parsed_args = parse_numeric_n_bool_cl_kwargs(rest) if args.local_rank <= 0: print(args) print(rest) # Initialize device and distributed backend utils.distributed_utils.init_distributed(args.device == "cuda") if utils.distributed_utils.get_world_size() > 1: utils.distributed_utils.set_affinity(args.local_rank) torch.cuda.set_device(args.local_rank) if Path(args.json_summary).exists(): warnings.warn( f"json_summary {args.json_summary} will be overwritten unless you type ctrl-c." ) if utils.distributed_utils.get_rank() == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step) ]) else: dllogger.init(backends=[]) if parsed_args and verbose: print(f"parsed the following generate kwargs: {parsed_args}") Path(args.save_path).parent.mkdir(exist_ok=True) json_save_path = Path(args.save_path + "/tmp") Path(json_save_path).mkdir(exist_ok=True) # this handles locking. if args.layers: num_layers = len(args.layers.split('-')) else: num_layers = None results, num_replicas, runtime_metrics = generate_summaries_or_translations( args.data_dir, json_save_path, args.model_path, args.config_path, batch_size=args.bs, device=args.device, fp16=args.fp16, task=args.task, prefix=args.prefix, eval_beams=args.eval_beams, max_source_length=args.max_source_length, max_target_length=args.max_target_length, eval_max_gen_length=args.eval_max_gen_length, n_obs=args.n_obs, type_path=args.type_path, num_return_sequences=args.num_return_sequences, distill=args.distill, num_layers=num_layers, do_encoder=args.do_encoder, do_decoder=args.do_decoder, **parsed_args, ) if args.local_rank <= 0: save_path = Path(args.save_path) save_path.mkdir(exist_ok=True) partial_results = gather_results_from_each_node( num_replicas, json_save_path, args.sync_timeout) preds, time_list = combine_partial_results(partial_results) if args.num_return_sequences > 1: save_path = save_path.joinpath("pseudolabel_results.json") print( f"Saving aggregated results at {save_path}, intermediate in {json_save_path}/" ) save_json(preds, save_path) return tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target") labels = [x.rstrip() for x in open(tgt_file).readlines()][:len(preds)] # Calculate metrics, save metrics, and save _generations.txt calc_bleu = "translation" in args.task score_fn = calculate_bleu if calc_bleu else calculate_rouge metric_name = "bleu" if calc_bleu else "rouge" metrics: Dict = score_fn(preds, labels) metrics["n_obs"] = len(preds) runtime = time.time() - start_time metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4) metrics["n_gpus"] = num_replicas metrics.update(runtime_metrics) time_list.sort() metrics["inference_latency_mean"] = np.mean(time_list) metrics["inference_latency_conf_50"] = max( time_list[:int(len(time_list) * 0.50)]) metrics["inference_latency_conf_90"] = max( time_list[:int(len(time_list) * 0.90)]) 
metrics["inference_latency_conf_95"] = max( time_list[:int(len(time_list) * 0.95)]) metrics["inference_latency_conf_99"] = max( time_list[:int(len(time_list) * 0.99)]) metrics["inference_latency_conf_100"] = max( time_list[:int(len(time_list) * 1)]) metrics["inference_throughput_mean"] = len(preds) * 1.0 / sum( time_list) metrics_save_path = save_path.joinpath( f"{args.type_path}_{metric_name}.json") save_json(metrics, metrics_save_path, indent=None) dllogger.log(step=tuple(), data=metrics) print(metrics) write_txt_file(preds, save_path.joinpath(f"{args.type_path}_generations.txt")) if args.debug: write_txt_file(labels, save_path.joinpath(f"{args.type_path}.target")) else: shutil.rmtree(json_save_path) dllogger.flush()
def setup_training(args):
    #assert (torch.cuda.is_available())
    if args.use_habana:
        sys.path.append(
            os.path.realpath(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "../../../common")))
        from library_loader import load_habana_module
        load_habana_module()
        device = torch.device("habana")

        if args.hmp:
            print(args.hmp_bf16)
            from hmp import hmp
            hmp.convert(opt_level=args.hmp_opt_level,
                        bf16_file_path=args.hmp_bf16,
                        fp32_file_path=args.hmp_fp32,
                        isVerbose=args.hmp_verbose)

        if args.use_jit_trace:
            enable_tracing()

        args.n_pu = 1
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
        if args.local_rank != -1:
            if os.getenv('HCL_CONFIG_PATH') is None:
                print("HCL_CONFIG_PATH is not set")
                exit(0)
            os.environ["ID"] = str(args.local_rank)
            args.world_size = int(os.environ["WORLD_SIZE"])
            args.rank = int(os.environ["RANK"])
            torch.distributed.init_process_group('hcl',
                                                 rank=args.rank,
                                                 world_size=args.world_size)
    elif args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        if device == torch.device("cuda"):
            args.n_pu = torch.cuda.device_count()
        else:
            args.n_pu = 1
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.n_pu = 1
        if args.gradient_accumulation_steps == 1:
            args.allreduce_post_accumulation = False
            args.allreduce_post_accumulation_fp16 = False

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print("device: {} n_pu: {}, distributed training: {}, 16-bits training: {}".format(
        device, args.n_pu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
                args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train` must be True.")

    if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
            os.listdir(args.output_dir)
            and any(i.startswith('ckpt') for i in os.listdir(args.output_dir))):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def main(): script_start = time.time() hvd_init() mpi_comm = MPI.COMM_WORLD args = parse_args() if hvd.rank() == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.log(data=vars(args), step='PARAMETER') if args.seed is not None: tf.random.set_random_seed(args.seed) np.random.seed(args.seed) cp.random.seed(args.seed) if args.amp: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1" if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \ and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1": args.fp16 = False if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '': os.makedirs(args.checkpoint_dir, exist_ok=True) final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt') # Load converted data and get statistics train_df = pd.read_pickle(args.data + '/train_ratings.pickle') test_df = pd.read_pickle(args.data + '/test_ratings.pickle') nb_users, nb_items = train_df.max() + 1 # Extract train and test feature tensors from dataframe pos_train_users = train_df.iloc[:, 0].values.astype(np.int32) pos_train_items = train_df.iloc[:, 1].values.astype(np.int32) pos_test_users = test_df.iloc[:, 0].values.astype(np.int32) pos_test_items = test_df.iloc[:, 1].values.astype(np.int32) # Negatives indicator for negatives generation neg_mat = np.ones((nb_users, nb_items), dtype=np.bool) neg_mat[pos_train_users, pos_train_items] = 0 # Get the local training/test data train_users, train_items, train_labels = get_local_train_data( pos_train_users, pos_train_items, args.negative_samples) test_users, test_items = get_local_test_data(pos_test_users, pos_test_items) # Create and run Data Generator in a separate thread data_generator = DataGenerator( args.seed, hvd.rank(), nb_users, nb_items, neg_mat, train_users, train_items, train_labels, args.batch_size // hvd.size(), args.negative_samples, test_users, test_items, args.valid_users_per_batch, args.valid_negative, ) # Create tensorflow session and saver config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) if args.xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 sess = tf.Session(config=config) # Input tensors users = tf.placeholder(tf.int32, shape=(None, )) items = tf.placeholder(tf.int32, shape=(None, )) labels = tf.placeholder(tf.int32, shape=(None, )) is_dup = tf.placeholder(tf.float32, shape=(None, )) dropout = tf.placeholder_with_default(args.dropout, shape=()) # Model ops and saver hit_rate, ndcg, eval_op, train_op = ncf_model_ops( users, items, labels, is_dup, params={ 'fp16': args.fp16, 'val_batch_size': args.valid_negative + 1, 'top_k': args.topk, 'learning_rate': args.learning_rate, 'beta_1': args.beta1, 'beta_2': args.beta2, 'epsilon': args.eps, 'num_users': nb_users, 'num_items': nb_items, 'num_factors': args.factors, 'mf_reg': 0, 'layer_sizes': args.layers, 'layer_regs': [0. 
for i in args.layers], 'dropout': dropout, 'sigmoid': True, 'loss_scale': args.loss_scale }, mode='TRAIN' if args.mode == 'train' else 'EVAL') saver = tf.train.Saver() # Accuracy metric tensors hr_sum = tf.get_default_graph().get_tensor_by_name( 'neumf/hit_rate/total:0') hr_cnt = tf.get_default_graph().get_tensor_by_name( 'neumf/hit_rate/count:0') ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0') ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0') # Prepare evaluation data data_generator.prepare_eval_data() if args.load_checkpoint_path: saver.restore(sess, args.load_checkpoint_path) else: # Manual initialize weights sess.run(tf.global_variables_initializer()) # If test mode, run one eval if args.mode == 'test': sess.run(tf.local_variables_initializer()) eval_start = time.time() for user_batch, item_batch, dup_batch \ in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask): sess.run(eval_op, feed_dict={ users: user_batch, items: item_batch, is_dup: dup_batch, dropout: 0.0 }) eval_duration = time.time() - eval_start # Report results hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False)) hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False)) ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False)) ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False)) hit_rate = hit_rate_sum / hit_rate_cnt ndcg = ndcg_sum / ndcg_cnt if hvd.rank() == 0: eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration dllogger.log(step=tuple(), data={ 'eval_throughput': eval_throughput, 'eval_time': eval_duration, 'hr@10': hit_rate, 'ndcg': ndcg }) return # Performance Metrics train_times = list() eval_times = list() # Accuracy Metrics first_to_target = None time_to_train = 0.0 best_hr = 0 best_epoch = 0 # Buffers for global metrics global_hr_sum = np.ones(1) global_hr_count = np.ones(1) global_ndcg_sum = np.ones(1) global_ndcg_count = np.ones(1) # Buffers for local metrics local_hr_sum = np.ones(1) local_hr_count = np.ones(1) local_ndcg_sum = np.ones(1) local_ndcg_count = np.ones(1) # Begin training begin_train = time.time() for epoch in range(args.epochs): # Train for one epoch train_start = time.time() data_generator.prepare_train_data() for user_batch, item_batch, label_batch \ in zip(data_generator.train_users_batches, data_generator.train_items_batches, data_generator.train_labels_batches): sess.run(train_op, feed_dict={ users: user_batch.get(), items: item_batch.get(), labels: label_batch.get() }) train_duration = time.time() - train_start # Only log "warm" epochs if epoch >= 1: train_times.append(train_duration) # Evaluate if epoch > args.eval_after: eval_start = time.time() sess.run(tf.local_variables_initializer()) for user_batch, item_batch, dup_batch \ in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask): sess.run(eval_op, feed_dict={ users: user_batch, items: item_batch, is_dup: dup_batch, dropout: 0.0 }) # Compute local metrics local_hr_sum[0] = sess.run(hr_sum) local_hr_count[0] = sess.run(hr_cnt) local_ndcg_sum[0] = sess.run(ndcg_sum) local_ndcg_count[0] = sess.run(ndcg_cnt) # Reduce metrics across all workers mpi_comm.Reduce(local_hr_count, global_hr_count) mpi_comm.Reduce(local_hr_sum, global_hr_sum) mpi_comm.Reduce(local_ndcg_count, global_ndcg_count) mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum) # Calculate metrics hit_rate = global_hr_sum[0] / global_hr_count[0] ndcg = global_ndcg_sum[0] / global_ndcg_count[0] eval_duration = 
time.time() - eval_start # Only log "warm" epochs if epoch >= 1: eval_times.append(eval_duration) if hvd.rank() == 0: dllogger.log(step=(epoch, ), data={ 'train_time': train_duration, 'eval_time': eval_duration, 'hr@10': hit_rate, 'ndcg': ndcg }) # Update summary metrics if hit_rate > args.target and first_to_target is None: first_to_target = epoch time_to_train = time.time() - begin_train if hit_rate > best_hr: best_hr = hit_rate best_epoch = epoch time_to_best = time.time() - begin_train if hit_rate > args.target: saver.save(sess, final_checkpoint_path) # Final Summary if hvd.rank() == 0: train_times = np.array(train_times) train_throughputs = pos_train_users.shape[0] * (args.negative_samples + 1) / train_times eval_times = np.array(eval_times) eval_throughputs = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_times dllogger.log(step=tuple(), data={ 'average_train_time_per_epoch': np.mean(train_times), 'average_train_throughput': np.mean(train_throughputs), 'average_eval_time_per_epoch': np.mean(eval_times), 'average_eval_throughput': np.mean(eval_throughputs), 'first_epoch_to_hit': first_to_target, 'time_to_train': time_to_train, 'time_to_best': time_to_best, 'best_hr': best_hr, 'best_epoch': best_epoch }) dllogger.flush() sess.close() return
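# Sketch of the metric aggregation used at evaluation time above: each worker holds a
# local (sum, count) pair and the global hit rate is computed from the MPI-reduced sums.
# Hypothetical standalone example using mpi4py; the numbers are illustrative.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_sum = np.array([123.0])   # e.g. per-worker sum of per-example hits
local_cnt = np.array([200.0])   # e.g. per-worker number of evaluated examples
global_sum = np.zeros(1)
global_cnt = np.zeros(1)
# Default op is MPI.SUM; only the root rank receives the reduced values.
comm.Reduce(local_sum, global_sum, root=0)
comm.Reduce(local_cnt, global_cnt, root=0)
if comm.Get_rank() == 0:
    print("hit rate:", global_sum[0] / global_cnt[0])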
def main(args, model_args): exp_start_time = time.time() global best_prec1 best_prec1 = 0 args.distributed = False if "WORLD_SIZE" in os.environ: args.distributed = int(os.environ["WORLD_SIZE"]) > 1 args.local_rank = int(os.environ["LOCAL_RANK"]) else: args.local_rank = 0 args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) dist.init_process_group(backend="nccl", init_method="env://") args.world_size = torch.distributed.get_world_size() if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) np.random.seed(seed=args.seed + args.local_rank) random.seed(args.seed + args.local_rank) def _worker_init_fn(id): np.random.seed(seed=args.seed + args.local_rank + id) random.seed(args.seed + args.local_rank + id) else: def _worker_init_fn(id): pass if args.static_loss_scale != 1.0: if not args.amp: print( "Warning: if --amp is not used, static_loss_scale will be ignored." ) if args.optimizer_batch_size < 0: batch_size_multiplier = 1 else: tbs = args.world_size * args.batch_size if args.optimizer_batch_size % tbs != 0: print( "Warning: simulated batch size {} is not divisible by actual batch size {}" .format(args.optimizer_batch_size, tbs)) batch_size_multiplier = int(args.optimizer_batch_size / tbs) print("BSM: {}".format(batch_size_multiplier)) start_epoch = 0 # optionally resume from a checkpoint if args.resume is not None: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) start_epoch = checkpoint["epoch"] best_prec1 = checkpoint["best_prec1"] model_state = checkpoint["state_dict"] optimizer_state = checkpoint["optimizer"] if "state_dict_ema" in checkpoint: model_state_ema = checkpoint["state_dict_ema"] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint["epoch"])) if start_epoch >= args.epochs: print( f"Launched training for {args.epochs}, checkpoint already run {start_epoch}" ) exit(1) else: print("=> no checkpoint found at '{}'".format(args.resume)) model_state = None model_state_ema = None optimizer_state = None else: model_state = None model_state_ema = None optimizer_state = None loss = nn.CrossEntropyLoss if args.mixup > 0.0: loss = lambda: NLLMultiLabelSmooth(args.label_smoothing) elif args.label_smoothing > 0.0: loss = lambda: LabelSmoothing(args.label_smoothing) memory_format = (torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format) model = available_models()[args.arch](**{ k: v if k != "pretrained" else v and ( not args.distributed or dist.get_rank() == 0) for k, v in model_args.__dict__.items() }) image_size = (args.image_size if args.image_size is not None else model.arch.default_image_size) model_and_loss = ModelAndLoss(model, loss, cuda=True, memory_format=memory_format) if args.use_ema is not None: model_ema = deepcopy(model_and_loss) ema = EMA(args.use_ema) else: model_ema = None ema = None # Create data loaders and optimizers as needed if args.data_backend == "pytorch": get_train_loader = get_pytorch_train_loader get_val_loader = get_pytorch_val_loader elif args.data_backend == "dali-gpu": get_train_loader = get_dali_train_loader(dali_cpu=False) get_val_loader = get_dali_val_loader() elif args.data_backend == "dali-cpu": get_train_loader = get_dali_train_loader(dali_cpu=True) get_val_loader = 
get_dali_val_loader() elif args.data_backend == "syntetic": get_val_loader = get_syntetic_loader get_train_loader = get_syntetic_loader else: print("Bad databackend picked") exit(1) train_loader, train_loader_len = get_train_loader( args.data, image_size, args.batch_size, model_args.num_classes, args.mixup > 0.0, interpolation=args.interpolation, augmentation=args.augmentation, start_epoch=start_epoch, workers=args.workers, memory_format=memory_format, ) if args.mixup != 0.0: train_loader = MixUpWrapper(args.mixup, train_loader) val_loader, val_loader_len = get_val_loader( args.data, image_size, args.batch_size, model_args.num_classes, False, interpolation=args.interpolation, workers=args.workers, memory_format=memory_format, ) if not torch.distributed.is_initialized() or torch.distributed.get_rank( ) == 0: logger = log.Logger( args.print_freq, [ dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT, step_format=log.format_step), dllogger.JSONStreamBackend( dllogger.Verbosity.VERBOSE, os.path.join(args.workspace, args.raport_file), ), ], start_epoch=start_epoch - 1, ) else: logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1) logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT) logger.log_parameter( {f"model.{k}": v for k, v in model_args.__dict__.items()}, verbosity=dllogger.Verbosity.DEFAULT, ) optimizer = get_optimizer( list(model_and_loss.model.named_parameters()), args.lr, args=args, state=optimizer_state, ) if args.lr_schedule == "step": lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger) elif args.lr_schedule == "cosine": lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, end_lr=args.end_lr, logger=logger) elif args.lr_schedule == "linear": lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger) scaler = torch.cuda.amp.GradScaler( init_scale=args.static_loss_scale, growth_factor=2, backoff_factor=0.5, growth_interval=100 if args.dynamic_loss_scale else 1000000000, enabled=args.amp, ) if args.distributed: model_and_loss.distributed(args.gpu) model_and_loss.load_model_state(model_state) if (ema is not None) and (model_state_ema is not None): print("load ema") ema.load_state_dict(model_state_ema) train_loop( model_and_loss, optimizer, scaler, lr_policy, train_loader, val_loader, logger, should_backup_checkpoint(args), ema=ema, model_ema=model_ema, steps_per_epoch=train_loader_len, use_amp=args.amp, batch_size_multiplier=batch_size_multiplier, start_epoch=start_epoch, end_epoch=min((start_epoch + args.run_epochs), args.epochs) if args.run_epochs != -1 else args.epochs, early_stopping_patience=args.early_stopping_patience, best_prec1=best_prec1, prof=args.prof, skip_training=args.evaluate, skip_validation=args.training_only, save_checkpoints=args.save_checkpoints and not args.evaluate, checkpoint_dir=args.workspace, checkpoint_filename=args.checkpoint_filename, ) exp_duration = time.time() - exp_start_time if not torch.distributed.is_initialized() or torch.distributed.get_rank( ) == 0: logger.end() print("Experiment ended")
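# Minimal sketch of the resume-from-checkpoint logic above (assumes the checkpoint was
# written as a dict with "epoch", "best_prec1", "state_dict" and "optimizer" keys, as in
# this training loop; the optional EMA weights key mirrors the code above).
import torch

def load_training_checkpoint(path, gpu):
    # Map tensors straight onto the GPU owned by this process.
    checkpoint = torch.load(path, map_location=lambda storage, loc: storage.cuda(gpu))
    start_epoch = checkpoint["epoch"]
    best_prec1 = checkpoint["best_prec1"]
    model_state = checkpoint["state_dict"]
    optimizer_state = checkpoint["optimizer"]
    model_state_ema = checkpoint.get("state_dict_ema")  # present only when EMA was used
    return start_epoch, best_prec1, model_state, optimizer_state, model_state_ema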
def prepare_for_training(args, model_args, model_arch): args.distributed = False if "WORLD_SIZE" in os.environ: args.distributed = int(os.environ["WORLD_SIZE"]) > 1 args.local_rank = int(os.environ["LOCAL_RANK"]) else: args.local_rank = 0 args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) dist.init_process_group(backend="nccl", init_method="env://") args.world_size = torch.distributed.get_world_size() affinity = set_affinity(args.gpu, mode=args.gpu_affinity) print(f"Training process {args.local_rank} affinity: {affinity}") if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) np.random.seed(seed=args.seed + args.local_rank) random.seed(args.seed + args.local_rank) def _worker_init_fn(id): # Worker process should inherit its affinity from parent affinity = os.sched_getaffinity(0) print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}") np.random.seed(seed=args.seed + args.local_rank + id) random.seed(args.seed + args.local_rank + id) else: def _worker_init_fn(id): # Worker process should inherit its affinity from parent affinity = os.sched_getaffinity(0) print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}") if args.static_loss_scale != 1.0: if not args.amp: print("Warning: if --amp is not used, static_loss_scale will be ignored.") if args.optimizer_batch_size < 0: batch_size_multiplier = 1 else: tbs = args.world_size * args.batch_size if args.optimizer_batch_size % tbs != 0: print( "Warning: simulated batch size {} is not divisible by actual batch size {}".format( args.optimizer_batch_size, tbs ) ) batch_size_multiplier = int(args.optimizer_batch_size / tbs) print("BSM: {}".format(batch_size_multiplier)) start_epoch = 0 # optionally resume from a checkpoint if args.resume is not None: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu) ) start_epoch = checkpoint["epoch"] best_prec1 = checkpoint["best_prec1"] model_state = checkpoint["state_dict"] optimizer_state = checkpoint["optimizer"] if "state_dict_ema" in checkpoint: model_state_ema = checkpoint["state_dict_ema"] print( "=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint["epoch"] ) ) if start_epoch >= args.epochs: print( f"Launched training for {args.epochs}, checkpoint already run {start_epoch}" ) exit(1) else: print("=> no checkpoint found at '{}'".format(args.resume)) model_state = None model_state_ema = None optimizer_state = None else: model_state = None model_state_ema = None optimizer_state = None loss = nn.CrossEntropyLoss if args.mixup > 0.0: loss = lambda: NLLMultiLabelSmooth(args.label_smoothing) elif args.label_smoothing > 0.0: loss = lambda: LabelSmoothing(args.label_smoothing) memory_format = ( torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format ) model = model_arch( **{ k: v if k != "pretrained" else v and (not args.distributed or dist.get_rank() == 0) for k, v in model_args.__dict__.items() } ) image_size = ( args.image_size if args.image_size is not None else model.arch.default_image_size ) scaler = torch.cuda.amp.GradScaler( init_scale=args.static_loss_scale, growth_factor=2, backoff_factor=0.5, growth_interval=100 if args.dynamic_loss_scale else 1000000000, enabled=args.amp, ) executor = Executor( 
model, loss(), cuda=True, memory_format=memory_format, amp=args.amp, scaler=scaler, divide_loss=batch_size_multiplier, ts_script=args.jit == "script", ) # Create data loaders and optimizers as needed if args.data_backend == "pytorch": get_train_loader = get_pytorch_train_loader get_val_loader = get_pytorch_val_loader elif args.data_backend == "dali-gpu": get_train_loader = get_dali_train_loader(dali_cpu=False) get_val_loader = get_dali_val_loader() elif args.data_backend == "dali-cpu": get_train_loader = get_dali_train_loader(dali_cpu=True) get_val_loader = get_dali_val_loader() elif args.data_backend == "syntetic": get_val_loader = get_syntetic_loader get_train_loader = get_syntetic_loader else: print("Bad databackend picked") exit(1) train_loader, train_loader_len = get_train_loader( args.data, image_size, args.batch_size, model_args.num_classes, args.mixup > 0.0, interpolation=args.interpolation, augmentation=args.augmentation, start_epoch=start_epoch, workers=args.workers, _worker_init_fn=_worker_init_fn, memory_format=memory_format, prefetch_factor=args.prefetch, ) if args.mixup != 0.0: train_loader = MixUpWrapper(args.mixup, train_loader) val_loader, val_loader_len = get_val_loader( args.data, image_size, args.batch_size, model_args.num_classes, False, interpolation=args.interpolation, workers=args.workers, _worker_init_fn=_worker_init_fn, memory_format=memory_format, prefetch_factor=args.prefetch, ) if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: logger = log.Logger( args.print_freq, [ dllogger.StdOutBackend( dllogger.Verbosity.DEFAULT, step_format=log.format_step ), dllogger.JSONStreamBackend( dllogger.Verbosity.VERBOSE, os.path.join(args.workspace, args.raport_file), ), ], start_epoch=start_epoch - 1, ) else: logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1) logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT) logger.log_parameter( {f"model.{k}": v for k, v in model_args.__dict__.items()}, verbosity=dllogger.Verbosity.DEFAULT, ) optimizer = get_optimizer( list(executor.model.named_parameters()), args.lr, args=args, state=optimizer_state, ) if args.lr_schedule == "step": lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup) elif args.lr_schedule == "cosine": lr_policy = lr_cosine_policy( args.lr, args.warmup, args.epochs, end_lr=args.end_lr ) elif args.lr_schedule == "linear": lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs) if args.distributed: executor.distributed(args.gpu) if model_state is not None: executor.model.load_state_dict(model_state) trainer = Trainer( executor, optimizer, grad_acc_steps=batch_size_multiplier, ema=args.use_ema, ) if (args.use_ema is not None) and (model_state_ema is not None): trainer.ema_executor.model.load_state_dict(model_state_ema) return ( trainer, lr_policy, train_loader, train_loader_len, val_loader, logger, start_epoch, )
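# Sketch of how the GradScaler created in prepare_for_training is typically driven in a
# single training step (illustrative only; the real step lives in the Executor/Trainer
# classes of this repo, which are not shown here).
import torch

def amp_train_step(model, criterion, optimizer, scaler, images, targets, use_amp=True):
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast(enabled=use_amp):
        loss = criterion(model(images), targets)
    scaler.scale(loss).backward()   # scaled backward pass to avoid fp16 underflow
    scaler.step(optimizer)          # unscales gradients, skips the step on inf/nan
    scaler.update()                 # adjusts the loss scale for the next iteration
    return loss.detach()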
def main(FLAGS): if FLAGS.hvd: hvd.init() if hvd.local_rank() == 0: tf.logging.set_verbosity(tf.logging.INFO) log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename) os.makedirs(FLAGS.results_dir, exist_ok=True) dllogger.init(backends=[ dllogger.JSONStreamBackend( verbosity=dllogger.Verbosity.VERBOSE, filename=log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: tf.logging.set_verbosity(tf.logging.ERROR) dllogger.init(backends=[]) num_gpus = hvd.size() else: tf.logging.set_verbosity(tf.logging.INFO) log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename) os.makedirs(FLAGS.results_dir, exist_ok=True) dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) num_gpus = 1 dllogger.log(data=vars(FLAGS), step='PARAMETER') create_batches = FLAGS.batch_size // FLAGS.prebatch_size wide_columns, deep_columns = get_feature_columns( use_all_columns=FLAGS.use_all_columns) tf_transform_output = tft.TFTransformOutput( FLAGS.transformed_metadata_path) if not FLAGS.hvd or hvd.local_rank() == 0: tf.compat.v1.logging.warn('command line arguments: {}'.format( json.dumps(vars(FLAGS)))) if not os.path.exists(FLAGS.results_dir): os.mkdir(FLAGS.results_dir) with open('{}/args.json'.format(FLAGS.results_dir), 'w') as f: json.dump(vars(FLAGS), f, indent=4) if FLAGS.gpu: session_config = tf.compat.v1.ConfigProto( log_device_placement=FLAGS.log_device_placement) else: session_config = tf.compat.v1.ConfigProto( device_count={'GPU': 0}, log_device_placement=FLAGS.log_device_placement) if FLAGS.hvd: session_config.gpu_options.visible_device_list = str(hvd.local_rank()) if FLAGS.xla: session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 if FLAGS.benchmark: model_dir = None else: model_dir = FLAGS.model_dir if FLAGS.save_checkpoints_steps != 0: run_config = tf.estimator.RunConfig(model_dir=model_dir).replace( session_config=session_config, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=1) else: run_config = tf.estimator.RunConfig(model_dir=model_dir).replace( session_config=session_config, save_checkpoints_secs=FLAGS.save_checkpoints_secs, keep_checkpoint_max=1) wide_optimizer = tf.compat.v1.train.FtrlOptimizer( learning_rate=FLAGS.linear_learning_rate, l1_regularization_strength=FLAGS.linear_l1_regularization, l2_regularization_strength=FLAGS.linear_l2_regularization) deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer( learning_rate=FLAGS.deep_learning_rate, initial_accumulator_value=0.1, l1_regularization_strength=FLAGS.deep_l1_regularization, l2_regularization_strength=FLAGS.deep_l2_regularization, use_locking=False) if FLAGS.hvd: wide_optimizer = hvd.DistributedOptimizer(wide_optimizer) deep_optimizer = hvd.DistributedOptimizer(deep_optimizer) stats_filename = os.path.join(FLAGS.transformed_metadata_path, 'stats.json') embed_columns = None # input functions to read data from disk train_input_fn = lambda: separate_input_fn( tf_transform_output, FLAGS.train_data_pattern, create_batches, tf.estimator.ModeKeys.TRAIN, reader_num_threads=FLAGS.reader_num_threads, parser_num_threads=FLAGS.parser_num_threads, shuffle_buffer_size=int(FLAGS.shuffle_percentage * create_batches), prefetch_buffer_size=FLAGS.prefetch_buffer_size, print_display_ids=FLAGS.print_display_ids) eval_input_fn = lambda: separate_input_fn( tf_transform_output, FLAGS.eval_data_pattern, (FLAGS.eval_batch_size // 
FLAGS.prebatch_size), tf.estimator.ModeKeys.EVAL, reader_num_threads=1, parser_num_threads=1, shuffle_buffer_size=int(FLAGS.shuffle_percentage * create_batches), prefetch_buffer_size=FLAGS.prefetch_buffer_size, print_display_ids=FLAGS.print_display_ids) estimator = construct_estimator(FLAGS.model_type, not FLAGS.canned_estimator, run_config, wide_columns, wide_optimizer, deep_columns, FLAGS.deep_hidden_units, FLAGS.deep_dropout, deep_optimizer, amp=FLAGS.amp) estimator = tf.estimator.add_metrics(estimator, map_custom_metric) estimator = tf.estimator.add_metrics(estimator, map_custom_metric_with_leak) steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size print('Steps per epoch: {}'.format(steps_per_epoch)) max_steps = int(FLAGS.num_epochs * steps_per_epoch) hooks = [] if FLAGS.hvd: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.predict or FLAGS.evaluate: # inference if FLAGS.benchmark: benchmark_hook = BenchmarkLoggingHook( global_batch_size=num_gpus * FLAGS.eval_batch_size, warmup_steps=FLAGS.benchmark_warmup_steps) hooks.append(benchmark_hook) eval_steps = FLAGS.benchmark_steps else: eval_steps = FLAGS.eval_steps predict_result_iter = estimator.predict(input_fn=eval_input_fn, hooks=hooks, yield_single_examples=False) results = [] for i, r in enumerate(predict_result_iter): print('predicting batch: ', i) results.append(r) # TODO: use eval_steps if i >= eval_steps - 1: break if FLAGS.benchmark: infer_throughput = benchmark_hook.mean_throughput.value() if FLAGS.benchmark: dllogger.log(data={'infer_throughput': infer_throughput}, step=tuple()) elif FLAGS.evaluate: print( 'evaluating using estimator.evaluate with eval_batch_size = ', FLAGS.eval_batch_size, ' and eval_steps = ', FLAGS.eval_steps) result = estimator.evaluate(eval_input_fn, hooks=hooks, steps=FLAGS.eval_steps) dllogger.log(step=(), data={ 'map_infer': float(result['map']), 'map_with_leak_infer': float(result['map_with_leak']) }) elif FLAGS.predict: scores = [r['probabilities'][:, 1] for r in results] scores = np.hstack(scores) scores_path = os.path.join(FLAGS.model_dir, 'scores.txt') print('saving the numpy scores array to: ', scores_path) np.savetxt(scores_path, scores, fmt="%f", delimiter='\n') else: # training if FLAGS.benchmark: benchmark_hook = BenchmarkLoggingHook( global_batch_size=num_gpus * FLAGS.batch_size, warmup_steps=FLAGS.benchmark_warmup_steps) hooks.append(benchmark_hook) estimator.train(train_input_fn, hooks=hooks, steps=FLAGS.benchmark_steps) train_throughput = benchmark_hook.mean_throughput.value() dllogger.log(data={'train_throughput': train_throughput}, step=tuple()) else: train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=max_steps, hooks=hooks) eval_spec = tf.estimator.EvalSpec( input_fn=eval_input_fn, throttle_secs=FLAGS.eval_throttle_secs, steps=FLAGS.eval_steps) result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) if result: dllogger.log(step=(), data={ 'map': float(result[0]['map']), 'map_with_leak': float(result[0]['map_with_leak']) })
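# Sketch of the Estimator train-and-evaluate wiring used above, reduced to the stock
# tf.estimator API (the repo's construct_estimator/add_metrics wrappers are omitted;
# the feature columns, input functions and hyperparameters are assumed to exist).
import tensorflow as tf

def train_and_evaluate(wide_columns, deep_columns, train_input_fn, eval_input_fn,
                       model_dir, max_steps, eval_steps):
    run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(
        save_checkpoints_steps=1000, keep_checkpoint_max=1)
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        config=run_config,
        linear_feature_columns=wide_columns,
        linear_optimizer=tf.compat.v1.train.FtrlOptimizer(learning_rate=0.1),
        dnn_feature_columns=deep_columns,
        dnn_optimizer=tf.compat.v1.train.ProximalAdagradOptimizer(learning_rate=0.05),
        dnn_hidden_units=[1024, 512, 256])
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=max_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=eval_steps)
    return tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)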
def main(): args = parse_args() init_distributed(args) if args.local_rank == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.metadata('train_throughput', { "name": 'train_throughput', 'format': ":.3e" }) dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"}) dllogger.metadata('train_epoch_time', { "name": 'train_epoch_time', 'format': ":.3f" }) dllogger.metadata('validation_epoch_time', { "name": 'validation_epoch_time', 'format': ":.3f" }) dllogger.metadata('eval_throughput', { "name": 'eval_throughput', 'format': ":.3e" }) dllogger.log(data=vars(args), step='PARAMETER') if args.seed is not None: torch.manual_seed(args.seed) if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir: print("Saving results to {}".format(args.checkpoint_dir)) os.makedirs(args.checkpoint_dir, exist_ok=True) # sync workers before timing if args.distributed: torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0) torch.cuda.synchronize() main_start_time = time.time() feature_spec_path = os.path.join(args.data, args.feature_spec_file) feature_spec = FeatureSpec.from_yaml(feature_spec_path) trainset = dataloading.TorchTensorDataset(feature_spec, mapping_name='train', args=args) testset = dataloading.TorchTensorDataset(feature_spec, mapping_name='test', args=args) train_loader = dataloading.TrainDataloader(trainset, args) test_loader = dataloading.TestDataLoader(testset, args) # make pytorch memory behavior more consistent later torch.cuda.empty_cache() # Create model user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0] item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0] label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0] model = NeuMF( nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'], nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'], mf_dim=args.factors, mlp_layer_sizes=args.layers, dropout=args.dropout) optimizer = FusedAdam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) criterion = nn.BCEWithLogitsLoss( reduction='none' ) # use torch.mean() with dim later to avoid copy to host # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() if args.amp: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale='dynamic') if args.distributed: model = DDP(model) local_batch = args.batch_size // args.world_size traced_criterion = torch.jit.trace( criterion.forward, (torch.rand(local_batch, 1), torch.rand(local_batch, 1))) print(model) print("{} parameters".format(utils.count_parameters(model))) if args.load_checkpoint_path: state_dict = torch.load(args.load_checkpoint_path) state_dict = { k.replace('module.', ''): v for k, v in state_dict.items() } model.load_state_dict(state_dict) if args.mode == 'test': start = time.time() hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed) val_time = time.time() - start eval_size = test_loader.raw_dataset_length eval_throughput = eval_size / val_time dllogger.log(step=tuple(), data={ 'best_eval_throughput': eval_throughput, 'hr@10': hr }) return # this should always be overridden if hr>0. # It is theoretically possible for the hit rate to be zero in the first epoch, which would result in referring # to an uninitialized variable. 
max_hr = 0 best_epoch = 0 best_model_timestamp = time.time() train_throughputs, eval_throughputs = [], [] for epoch in range(args.epochs): begin = time.time() batch_dict_list = train_loader.get_epoch_data() num_batches = len(batch_dict_list) for i in range(num_batches // args.grads_accumulated): for j in range(args.grads_accumulated): batch_idx = (args.grads_accumulated * i) + j batch_dict = batch_dict_list[batch_idx] user_features = batch_dict[USER_CHANNEL_NAME] item_features = batch_dict[ITEM_CHANNEL_NAME] user_batch = user_features[user_feature_name] item_batch = item_features[item_feature_name] label_features = batch_dict[LABEL_CHANNEL_NAME] label_batch = label_features[label_feature_name] outputs = model(user_batch, item_batch) loss = traced_criterion(outputs, label_batch.view(-1, 1)).float() loss = torch.mean(loss.view(-1), 0) if args.amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() for p in model.parameters(): p.grad = None del batch_dict_list train_time = time.time() - begin begin = time.time() epoch_samples = train_loader.length_after_augmentation train_throughput = epoch_samples / train_time train_throughputs.append(train_throughput) hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed) val_time = time.time() - begin eval_size = test_loader.raw_dataset_length eval_throughput = eval_size / val_time eval_throughputs.append(eval_throughput) dllogger.log(step=(epoch, ), data={ 'train_throughput': train_throughput, 'hr@10': hr, 'train_epoch_time': train_time, 'validation_epoch_time': val_time, 'eval_throughput': eval_throughput }) if hr > max_hr and args.local_rank == 0: max_hr = hr best_epoch = epoch print("New best hr!") if args.checkpoint_dir: save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth') print("Saving the model to: ", save_checkpoint_path) torch.save(model.state_dict(), save_checkpoint_path) best_model_timestamp = time.time() if args.threshold is not None: if hr >= args.threshold: print("Hit threshold of {}".format(args.threshold)) break if args.local_rank == 0: dllogger.log(data={ 'best_train_throughput': max(train_throughputs), 'best_eval_throughput': max(eval_throughputs), 'mean_train_throughput': np.mean(train_throughputs), 'mean_eval_throughput': np.mean(eval_throughputs), 'best_accuracy': max_hr, 'best_epoch': best_epoch, 'time_to_target': time.time() - main_start_time, 'time_to_best_model': best_model_timestamp - main_start_time }, step=tuple())
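# Standalone sketch of the gradient-accumulation pattern in the training loop above: run
# `grads_accumulated` backward passes before a single optimizer step, then clear the
# gradients by setting them to None (hypothetical helper; the batches are illustrative).
import torch

def accumulate_and_step(model, criterion, optimizer, micro_batches, grads_accumulated):
    for step, (users, items, labels) in enumerate(micro_batches, start=1):
        loss = criterion(model(users, items), labels.view(-1, 1)).mean()
        loss.backward()
        if step % grads_accumulated == 0:
            optimizer.step()
            for p in model.parameters():
                p.grad = None   # cheaper than zeroing, same trick as the loop above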
def main(): args = parse_args() hvd.init() set_affinity(hvd.local_rank()) if is_main_process(): log("Running total processes: {}".format(get_world_size())) log("Starting process: {}".format(get_rank())) if is_main_process(): dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) else: dllogger.init(backends=[]) tf.random.set_seed(args.seed) dllogger.log(step="PARAMETER", data={"SEED": args.seed}) # script parameters BATCH_SIZE = args.train_batch_size EVAL_BATCH_SIZE = args.predict_batch_size USE_XLA = args.xla USE_AMP = args.amp EPOCHS = args.num_train_epochs if not args.do_train: EPOCHS = args.num_train_epochs = 1 log("Since running inference only, setting args.num_train_epochs to 1") if not os.path.exists(args.output_dir) and is_main_process(): os.makedirs(args.output_dir) # TensorFlow configuration gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') tf.config.optimizer.set_jit(USE_XLA) #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) if args.amp: policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") tf.keras.mixed_precision.experimental.set_policy(policy) print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 if is_main_process(): log("***** Loading tokenizer and model *****") # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) electra_model = args.electra_model config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir) config.update({"amp": args.amp}) if args.vocab_file is None: tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir) else: tokenizer = ElectraTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args) if is_main_process(): log("***** Loading dataset *****") # Load data processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None if is_main_process(): log("***** Loading features *****") # Load cached features squad_version = '2.0' if args.version_2_with_negative else '1.1' if args.cache_dir is None: args.cache_dir = args.data_dir cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format( electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), squad_version) cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format( electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), squad_version) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) if args.do_train else [] with open(cached_dev_features_file, "rb") as reader: dev_features = pickle.load(reader) if args.do_predict else [] except: train_features = ( # TODO: 
(yy) do on rank 0? squad_convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True, return_dataset="", ) if args.do_train else [] ) dev_features = ( squad_convert_examples_to_features( examples=dev_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False, return_dataset="", ) if args.do_predict else [] ) # Dump Cached features if not args.skip_cache and is_main_process(): if args.do_train: log("***** Building Cache Files: {} *****".format(cached_train_features_file)) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) if args.do_predict: log("***** Building Cache Files: {} *****".format(cached_dev_features_file)) with open(cached_dev_features_file, "wb") as writer: pickle.dump(dev_features, writer) len_train_features = len(train_features) total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1 train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1 len_dev_features = len(dev_features) total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1 train_dataset = get_dataset_from_features(train_features, BATCH_SIZE, v2=args.version_2_with_negative) if args.do_train else [] dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev", v2=args.version_2_with_negative) if args.do_predict else [] opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps, num_warmup_steps=int(args.warmup_proportion * total_train_steps), weight_decay_rate=args.weight_decay_rate, layerwise_lr_decay=args.layerwise_lr_decay, n_transformer_layers=model.num_hidden_layers) if USE_AMP: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic") # Define loss function loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) loss_class = tf.keras.losses.BinaryCrossentropy( from_logits=True, name='binary_crossentropy' ) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=opt, loss=loss, metrics=[metric]) train_loss_results = [] if args.do_train and is_main_process(): log("***** Running training *****") log(" Num examples = ", len_train_features) log(" Num Epochs = ", args.num_train_epochs) log(" Instantaneous batch size per GPU = ", args.train_batch_size) log( " Total train batch size (w. parallel, distributed & accumulation) = ", args.train_batch_size * get_world_size(), ) log(" Total optimization steps =", total_train_steps) total_train_time = 0 latency = [] for epoch in range(EPOCHS): if args.do_train: epoch_loss_avg = tf.keras.metrics.Mean() epoch_perf_avg = tf.keras.metrics.Mean() epoch_start = time.time() epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5, disable=not is_main_process()) for iter, inputs in enumerate(epoch_iterator): # breaking criterion if max_steps if > 1 if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps: break iter_start = time.time() # Optimize the model loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0), v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP) epoch_perf_avg.update_state(1. 
* BATCH_SIZE / (time.time() - iter_start)) if iter % args.log_freq == 0: if is_main_process(): log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}".format(epoch, iter, loss_value, epoch_perf_avg.result() * get_world_size(), opt.loss_scale if config.amp else 1, int(opt.iterations))) dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()), "train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())}) # Track progress epoch_loss_avg.update_state(loss_value) # Add current batch loss # End epoch train_loss_results.append(epoch_loss_avg.result()) total_train_time += float(time.time() - epoch_start) # Summarize and save checkpoint at the end of each epoch if is_main_process(): dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time, "training_sequences_per_second": float( epoch_perf_avg.result().numpy() * get_world_size()), "final_loss": float(epoch_loss_avg.result().numpy())}) if not args.skip_checkpoint: if args.ci: checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.output_dir, args.version_2_with_negative, epoch + 1) else: checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1) if is_main_process(): model.save_weights(checkpoint_name) if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1): if not args.do_train: log("***** Loading checkpoint: {} *****".format(args.init_checkpoint)) model.load_weights(args.init_checkpoint).expect_partial() current_feature_id = 0 all_results = [] if is_main_process(): log("***** Running evaluation *****") log(" Num Batches = ", total_dev_steps) log(" Batch size = ", args.predict_batch_size) raw_infer_start = time.time() if is_main_process(): infer_perf_avg = tf.keras.metrics.Mean() dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5, disable=not is_main_process()) for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator: # training=False is needed only if there are layers with different # behavior during training versus inference (e.g. Dropout). iter_start = time.time() if not args.joint_head: batch_start_logits, batch_end_logits = infer_step(model, input_ids, attention_mask=input_mask, token_type_ids=segment_ids, )[:2] #Synchronize with GPU to compute time _ = batch_start_logits.numpy() else: outputs = infer_step(model, input_ids, attention_mask=input_mask, token_type_ids=segment_ids, cls_index=cls_index, p_mask=p_mask, ) #Synchronize with GPU to compute time _ = outputs[0].numpy() infer_time = (time.time() - iter_start) infer_perf_avg.update_state(1. 
* EVAL_BATCH_SIZE / infer_time) latency.append(infer_time) for iter_ in range(input_ids.shape[0]): if not args.joint_head: start_logits = batch_start_logits[iter_].numpy().tolist() end_logits = batch_end_logits[iter_].numpy().tolist() dev_feature = dev_features[current_feature_id] current_feature_id += 1 unique_id = int(dev_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) else: dev_feature = dev_features[current_feature_id] current_feature_id += 1 unique_id = int(dev_feature.unique_id) output = [output[iter_].numpy().tolist() for output in outputs] start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) all_results.append(result) # Compute and save predictions answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") e2e_infer_time = time.time() - raw_infer_start # if args.version_2_with_negative: # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") # else: # output_null_log_odds_file = None with open(output_prediction_file, "w") as f: f.write(json.dumps(answers, indent=4) + "\n") with open(output_nbest_file, "w") as f: f.write(json.dumps(nbest_answers, indent=4) + "\n") if args.do_eval: if args.version_2_with_negative: dev_file = "dev-v2.0.json" else: dev_file = "dev-v1.1.json" eval_out = subprocess.check_output([sys.executable, args.eval_script, args.data_dir + "/" + dev_file, output_prediction_file]) log(eval_out.decode('UTF-8')) scores = str(eval_out).strip() exact_match = float(scores.split(":")[1].split(",")[0]) if args.version_2_with_negative: f1 = float(scores.split(":")[2].split(",")[0]) else: f1 = float(scores.split(":")[2].split("}")[0]) log("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8'))) log("**EVAL SUMMARY** - Epoch: {:03d}, EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s" .format(epoch, exact_match, f1, infer_perf_avg.result())) latency_all = sorted(latency)[:-2] log( "**LATENCY SUMMARY** - Epoch: {:03d}, Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms" .format(epoch, sum(latency_all) / len(latency_all) * 1000, sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000, sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000, sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000, )) dllogger.log(step=tuple(), data={"inference_sequences_per_second": float(infer_perf_avg.result().numpy()), "e2e_inference_time": e2e_infer_time}) if is_main_process() and args.do_train and args.do_eval: log( "**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s" .format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(), infer_perf_avg.result())) dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
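# Sketch of the feature-caching pattern used above: try to unpickle previously computed
# SQuAD features and fall back to recomputing and writing the cache. The cache path and
# the compute_fn callable are placeholders, not names from this script.
import os
import pickle

def load_or_build_features(cache_file, compute_fn, is_main):
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as reader:
            return pickle.load(reader)
    features = compute_fn()
    if is_main:  # only one process writes the cache file
        with open(cache_file, "wb") as writer:
            pickle.dump(features, writer)
    return features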
def main(): run = Run.get_context() workspace = run.experiment.workspace # First thing to do is try to set up from environment configure_nccl_settings_from_env() parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=os.getenv("LOCAL_RANK", 0)) parser.add_argument( "--max_steps", type=int, default=0, help="Override number of training steps in the config", ) parser.add_argument("--dataset", type=str, required=True) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument("--fp16", help="Mixed precision training", action="store_true") parser.add_argument("--amp", help="Mixed precision training", action="store_true") parser.add_argument( "--skip_checkpoint", default=False, action="store_true", help="Whether to save checkpoints", ) parser.add_argument( "--json-summary", help="Out file for DLLogger", default="dllogger.out", type=str, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() args.fp16 = args.fp16 or args.amp num_gpus = get_global_size() args.distributed = num_gpus > 1 args.local_rank = get_local_rank() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) # Redundant option - Override config parameter with command line input if args.max_steps > 0: cfg.SOLVER.MAX_ITER = args.max_steps if args.skip_checkpoint: cfg.SAVE_CHECKPOINT = False cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) if is_main_process(): dllogger.init( backends=[ dllogger.JSONStreamBackend( verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary ), dllogger.StdOutBackend( verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step ), ] ) else: dllogger.init(backends=[]) dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus}) # dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()}) dllogger.log(step="PARAMETER", data={"config_file": args.config_file}) dllogger.log(step="PARAMETER", data={"config": cfg}) if args.fp16: fp16 = True else: fp16 = False if args.local_rank == 0: dllogger.log(step="WEIGHT DOWNLOAD", data={"complete": False}) download_weights(cfg.MODEL.WEIGHT, cfg.PATHS_CATALOG) dllogger.log(step="WEIGHT DOWNLOAD", data={"complete": True}) dllogger.log( step="DATASET MOUNT", data={"complete": False, "dataset": args.dataset} ) coco2017 = Dataset.get_by_name(workspace, args.dataset) cc2017mount = coco2017.mount("/data") cc2017mount.start() dllogger.log( step="DATASET MOUNT", data={"complete": True, "dataset": args.dataset} ) if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() model, iters_per_epoch = train( cfg, args.local_rank, args.distributed, fp16, dllogger )
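# Minimal sketch of the command-line pattern used above: a --config-file option plus a
# trailing list of "KEY VALUE" overrides captured with argparse.REMAINDER (hypothetical
# standalone example; the real script merges these overrides into a yacs config).
import argparse

parser = argparse.ArgumentParser(description="Config-file driven training")
parser.add_argument("--config-file", default="", metavar="FILE", type=str)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("opts", default=None, nargs=argparse.REMAINDER,
                    help="Modify config options from the command line")
args = parser.parse_args(["--config-file", "e2e_mask_rcnn.yaml",
                          "SOLVER.MAX_ITER", "1000"])
print(args.config_file, args.opts)  # e2e_mask_rcnn.yaml ['SOLVER.MAX_ITER', '1000']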
def main(args): exp_start_time = time.time() global best_prec1 best_prec1 = 0 args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.local_rank = int(os.environ['LOCAL_RANK']) args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) dist.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() if args.amp and args.fp16: print("Please use only one of the --fp16/--amp flags") exit(1) if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) np.random.seed(seed=args.seed + args.local_rank) random.seed(args.seed + args.local_rank) def _worker_init_fn(id): np.random.seed(seed=args.seed + args.local_rank + id) random.seed(args.seed + args.local_rank + id) else: def _worker_init_fn(id): pass if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) if args.optimizer_batch_size < 0: batch_size_multiplier = 1 else: tbs = args.world_size * args.batch_size if args.optimizer_batch_size % tbs != 0: print( "Warning: simulated batch size {} is not divisible by actual batch size {}" .format(args.optimizer_batch_size, tbs)) batch_size_multiplier = int(args.optimizer_batch_size / tbs) print("BSM: {}".format(batch_size_multiplier)) pretrained_weights = None if args.pretrained_weights: if os.path.isfile(args.pretrained_weights): print("=> loading pretrained weights from '{}'".format( args.pretrained_weights)) pretrained_weights = torch.load(args.pretrained_weights) else: print("=> no pretrained weights found at '{}'".format(args.pretrained_weights)) start_epoch = 0 # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model_state = checkpoint['state_dict'] optimizer_state = checkpoint['optimizer'] print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) model_state = None optimizer_state = None else: model_state = None optimizer_state = None loss = nn.CrossEntropyLoss if args.mixup > 0.0: loss = lambda: NLLMultiLabelSmooth(args.label_smoothing) elif args.label_smoothing > 0.0: loss = lambda: LabelSmoothing(args.label_smoothing) model_and_loss = ModelAndLoss((args.arch, args.model_config), loss, pretrained_weights=pretrained_weights, cuda=True, fp16=args.fp16) # Create data loaders and optimizers as needed if args.data_backend == 'pytorch': get_train_loader = get_pytorch_train_loader get_val_loader = get_pytorch_val_loader elif args.data_backend == 'dali-gpu': get_train_loader = get_dali_train_loader(dali_cpu=False) get_val_loader = get_dali_val_loader() elif args.data_backend == 'dali-cpu': get_train_loader = get_dali_train_loader(dali_cpu=True) get_val_loader = get_dali_val_loader() elif args.data_backend == 'syntetic': get_val_loader = get_syntetic_loader get_train_loader = get_syntetic_loader train_loader, train_loader_len = get_train_loader(args.data, args.batch_size, 1000, args.mixup > 0.0,
workers=args.workers, fp16=args.fp16) if args.mixup != 0.0: train_loader = MixUpWrapper(args.mixup, 1000, train_loader) val_loader, val_loader_len = get_val_loader(args.data, args.batch_size, 1000, False, workers=args.workers, fp16=args.fp16) if not torch.distributed.is_initialized() or torch.distributed.get_rank( ) == 0: logger = log.Logger(args.print_freq, [ dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT, step_format=log.format_step), dllogger.JSONStreamBackend( dllogger.Verbosity.VERBOSE, os.path.join(args.workspace, args.raport_file)) ]) else: logger = log.Logger(args.print_freq, []) logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT) optimizer = get_optimizer(list(model_and_loss.model.named_parameters()), args.fp16, args.lr, args.momentum, args.weight_decay, nesterov=args.nesterov, bn_weight_decay=args.bn_weight_decay, state=optimizer_state, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) if args.lr_schedule == 'step': lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger) elif args.lr_schedule == 'cosine': lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=logger) elif args.lr_schedule == 'linear': lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger) if args.amp: model_and_loss, optimizer = amp.initialize( model_and_loss, optimizer, opt_level="O2", loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale) if args.distributed: model_and_loss.distributed() model_and_loss.load_model_state(model_state) train_loop(model_and_loss, optimizer, lr_policy, train_loader, val_loader, args.epochs, args.fp16, logger, should_backup_checkpoint(args), use_amp=args.amp, batch_size_multiplier=batch_size_multiplier, start_epoch=start_epoch, best_prec1=best_prec1, prof=args.prof, skip_training=args.evaluate, skip_validation=args.training_only, save_checkpoints=args.save_checkpoints and not args.evaluate, checkpoint_dir=args.workspace) exp_duration = time.time() - exp_start_time if not torch.distributed.is_initialized() or torch.distributed.get_rank( ) == 0: logger.end() print("Experiment ended")
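# Sketch of the NVIDIA Apex AMP flow used above: wrap model and optimizer once with
# amp.initialize, then scale the loss at backward time. Assumes the apex package is
# installed; opt_level "O2" matches the choice made in this script.
import torch
from apex import amp

def apex_train_step(model, criterion, optimizer, images, targets):
    loss = criterion(model(images), targets)
    optimizer.zero_grad()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()   # gradients are computed on the scaled loss
    optimizer.step()
    return loss.detach()

# One-time setup, mirroring the call in main():
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")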
def main(): hvd.init() parser = ArgumentParser(description="Train a Variational Autoencoder for Collaborative Filtering in TensorFlow") parser.add_argument('--train', action='store_true', help='Run training of VAE') parser.add_argument('--test', action='store_true', help='Run validation of VAE') parser.add_argument('--inference', action='store_true', help='Run inference on a single random example.' 'This can also be used to measure the latency for a batch size of 1') parser.add_argument('--inference_benchmark', action='store_true', help='Benchmark the inference throughput on a very large batch size') parser.add_argument('--use_tf_amp', action='store_true', help='Enable Automatic Mixed Precision') parser.add_argument('--epochs', type=int, default=400, help='Number of epochs to train') parser.add_argument('--batch_size_train', type=int, default=24576, help='Global batch size for training') parser.add_argument('--batch_size_validation', type=int, default=10000, help='Used both for validation and testing') parser.add_argument('--validation_step', type=int, default=50, help='Train epochs for one validation') parser.add_argument('--warm_up_epochs', type=int, default=5, help='Number of epochs to omit during benchmark') parser.add_argument('--total_anneal_steps', type=int, default=15000, help='Number of annealing steps') parser.add_argument('--anneal_cap', type=float, default=0.1, help='Annealing cap') parser.add_argument('--lam', type=float, default=1.00, help='Regularization parameter') parser.add_argument('--lr', type=float, default=0.004, help='Learning rate') parser.add_argument('--beta1', type=float, default=0.90, help='Adam beta1') parser.add_argument('--beta2', type=float, default=0.90, help='Adam beta2') parser.add_argument('--top_results', type=int, default=100, help='Number of results to be recommended') parser.add_argument('--xla', action='store_true', default=False, help='Enable XLA') parser.add_argument('--trace', action='store_true', default=False, help='Save profiling traces') parser.add_argument('--activation', type=str, default='tanh', help='Activation function') parser.add_argument('--log_path', type=str, default='./vae_cf.log', help='Path to the detailed training log to be created') parser.add_argument('--seed', type=int, default=0, help='Random seed for TensorFlow and numpy') parser.add_argument('--data_dir', default='/data', type=str, help='Directory for storing the training data') parser.add_argument('--checkpoint_dir', type=str, default=None, help='Path for saving a checkpoint after the training') args = parser.parse_args() if args.batch_size_train % hvd.size() != 0: raise ValueError('Global batch size should be a multiple of the number of workers') args.local_batch_size = args.batch_size_train // hvd.size() logger = logging.getLogger("VAE") if hvd.rank() == 0: logger.setLevel(logging.INFO) dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)]) else: dllogger.init(backends=[]) logger.setLevel(logging.ERROR) dllogger.log(data=vars(args), step='PARAMETER') np.random.seed(args.seed) tf.set_random_seed(args.seed) # Suppress TF warnings os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # set AMP os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0' # load dataset (train_data, validation_data_input, validation_data_true, test_data_input, test_data_true) = load_and_parse_ML_20M(args.data_dir) # make sure all dims and sizes are divisible by 8 
number_of_train_users, number_of_items = train_data.shape number_of_items = round_8(number_of_items) for data in [train_data, validation_data_input, validation_data_true, test_data_input, test_data_true]: number_of_users, _ = data.shape data.resize(number_of_users, number_of_items) number_of_users, number_of_items = train_data.shape encoder_dims = [number_of_items, 600, 200] vae = VAE(train_data, encoder_dims, total_anneal_steps=args.total_anneal_steps, anneal_cap=args.anneal_cap, batch_size_train=args.local_batch_size, batch_size_validation=args.batch_size_validation, lam=args.lam, lr=args.lr, beta1=args.beta1, beta2=args.beta2, activation=args.activation, xla=args.xla, checkpoint_dir=args.checkpoint_dir, trace=args.trace, top_results=args.top_results) metrics = {'ndcg@100': partial(ndcg, R=100), 'recall@20': partial(recall, R=20), 'recall@50': partial(recall, R=50)} if args.train: vae.train(n_epochs=args.epochs, validation_data_input=validation_data_input, validation_data_true=validation_data_true, metrics=metrics, validation_step=args.validation_step) if args.test and hvd.size() <= 1: test_results = vae.test(test_data_input=test_data_input, test_data_true=test_data_true, metrics=metrics) for k, v in test_results.items(): print("{}:\t{}".format(k, v)) elif args.test and hvd.size() > 1: print("Testing is not supported with horovod multigpu yet") if args.inference_benchmark and hvd.size() <= 1: # use the train data to get accurate throughput numbers for inference # the test and validation sets are too small to measure this accurately # vae.inference_benchmark() _ = vae.test(test_data_input=train_data, test_data_true=train_data, metrics={}) elif args.inference_benchmark and hvd.size() > 1: print("Inference benchmark is not supported with horovod multigpu yet") if args.inference: input_data = np.random.randint(low=0, high=10000, size=10) recommendations = vae.query(input_data=input_data) print('Recommended item indices: ', recommendations) vae.close_session() dllogger.flush()
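# Sketch of the "make the item dimension divisible by 8" step above, which pads the
# rating matrices so the encoder layers hit Tensor Core friendly shapes. round_8 here is
# a hypothetical re-implementation of the helper used by this script, and the dense
# array below is only an illustrative stand-in for the real ratings matrices.
import numpy as np

def round_8(n):
    return ((n + 7) // 8) * 8

train_data = np.zeros((10, 1002), dtype=np.float32)
padded_items = round_8(train_data.shape[1])                       # -> 1008
train_data.resize((train_data.shape[0], padded_items), refcheck=False)  # zero-pads in place
print(train_data.shape)                                           # (10, 1008)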