def setup_training(args):
    assert torch.cuda.is_available()

    if args.local_rank == -1:
        device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='smddp',
                                             init_method='env://')
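        # 'smddp' is SageMaker's distributed data parallel backend; a plain PyTorch DDP setup would typically use 'nccl' here.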
        args.n_gpu = 1

    if args.gradient_accumulation_steps == 1:
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
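    # After this division, train_batch_size is the per-step micro-batch; the accumulation loop restores the original effective batch.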

    if not args.do_train:
        raise ValueError(" `do_train`  must be True.")

    if not args.resume_from_checkpoint and os.path.exists(
            args.output_dir) and (os.listdir(args.output_dir) and any(
                [i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
Example #2
def main():
    args = parse_args()
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])

    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users,
                  nb_items=args.n_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model,
                               opt_level=args.opt_level,
                               keep_batchnorm_fp32=False,
                               loss_scale='dynamic')
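        # Apex AMP "O2" casts the model (including batchnorm, since keep_batchnorm_fp32=False) to FP16;
        # the dynamic loss scale only matters for backward passes and is unused in this inference-only script.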
    model.eval()

    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        torch.cuda.synchronize()
        start = time.time()
        predictions = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    dllogger.log(data={
        'batch_size': args.batch_size,
        'best_inference_throughput': args.batch_size / min(latencies),
        'best_inference_latency': min(latencies),
        'mean_inference_throughput': args.batch_size / np.mean(latencies),
        'mean_inference_latency': np.mean(latencies),
        'inference_latencies': latencies,
    }, step=tuple())
    dllogger.flush()
    return
Example #3
def init_logging(log_path):
    json_backend = dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                              filename=log_path)
    stdout_backend = dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
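    # The format strings below are display hints kept in the StdOutBackend's private _metadata map; they affect console output only.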

    stdout_backend._metadata['best_auc'].update({'format': '0:.5f'})
    stdout_backend._metadata['best_epoch'].update({'format': '0:.2f'})
    stdout_backend._metadata['average_train_throughput'].update({'format': ':.2e'})
    stdout_backend._metadata['average_test_throughput'].update({'format': ':.2e'})

    dllogger.init(backends=[json_backend, stdout_backend])
Example #4
def setup_logging(args):
    logging.basicConfig(level=logging.DEBUG,
                        format='{asctime}:{levelname}: {message}',
                        style='{')
    if hvd.rank() == 0:
        dllogger.init(backends=[
            dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                   step_format=format_step),
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                os.path.join(args.workspace, args.dllogger_log)),
        ])
    else:
        dllogger.init([])
def setup_training(args):

    assert torch.cuda.is_available()

    global ort_supplement
    import ort_supplement.ort_supplement as ort_supplement
    device = ort_supplement.setup_onnxruntime_with_mpi(args)

    if is_main_process(args):
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train`  must be True.")

    if not args.resume_from_checkpoint and os.path.exists(
            args.output_dir) and (os.listdir(args.output_dir) and any(
                [i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process(args):
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def main():
    args = parse_args()
    dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                       filename=args.log_path),
                            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])

    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)

    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.fp16:
        model.half()
    model.eval()
    
    batch_sizes = args.batch_sizes.split(',')
    batch_sizes = [int(s) for s in batch_sizes]

    result_data = {}
    for batch_size in batch_sizes:
        print('benchmarking batch size: ', batch_size)
        users = torch.cuda.LongTensor(batch_size).random_(0, args.n_users)
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
        for _ in range(args.num_batches):
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
            latencies.append(time.time() - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
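        # np.percentile expects percentiles on a 0-100 scale.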
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
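    # step=tuple() is the convention these scripts use for run-level summary metrics (no step index attached).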
    dllogger.flush()
    return
    def __init__(self, name, json_output=None, print_freq=20):
        self.name = name
        self.train_loss_logger = IterationAverageMeter("Training loss")
        self.train_epoch_time_logger = EpochMeter("Training 1 epoch time")
        self.val_acc_logger = EpochMeter("Validation accuracy")
        self.print_freq = print_freq

        backends = [DLLogger.StdOutBackend(DLLogger.Verbosity.DEFAULT)]
        if json_output:
            backends.append(
                DLLogger.JSONStreamBackend(DLLogger.Verbosity.VERBOSE,
                                           json_output))

        DLLogger.init(backends)

        self.epoch = 0
        self.train_iter = 0
        self.summary = {}
Example #8
def init_logger(args, full, logger):
    if full:
        logger.setLevel(logging.INFO)
        log_path = os.path.join(args.results_dir, args.log_filename)
        os.makedirs(args.results_dir, exist_ok=True)
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
        logger.warning('command line arguments: {}'.format(json.dumps(vars(args))))
        if not os.path.exists(args.results_dir):
            os.mkdir(args.results_dir)

        with open('{}/args.json'.format(args.results_dir), 'w') as f:
            json.dump(vars(args), f, indent=4)
    else:
        logger.setLevel(logging.ERROR)
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')
Example #9
def init_logging(log_path, FLAGS):
    json_backend = dllogger.JSONStreamBackend(
        verbosity=dllogger.Verbosity.VERBOSE, filename=log_path)
    stdout_backend = dllogger.StdOutBackend(
        verbosity=dllogger.Verbosity.VERBOSE)

    stdout_backend._metadata['auc'].update({'format': '0:.5f'})
    stdout_backend._metadata['throughput'].update({'format': ':.2e'})
    stdout_backend._metadata['mean_step_time_ms'].update({'format': '0:.3f'})
    stdout_backend._metadata['mean_inference_throughput'].update(
        {'format': ':.2e'})
    stdout_backend._metadata['mean_inference_latency'].update(
        {'format': '0:.5f'})
    for percentile in [90, 95, 99]:
        stdout_backend._metadata[f'p{percentile}_inference_latency'].update(
            {'format': '0:.5f'})

    dllogger.init(backends=[json_backend, stdout_backend])

    if hvd.rank() == 0:
        dllogger.log(data=FLAGS.flag_values_dict(), step='PARAMETER')
        print("Command line flags:")
        print(json.dumps(FLAGS.flag_values_dict(), indent=4))
def setup_logger(args):
    os.makedirs(args.log_dir, exist_ok=True)
    if not args.json_summary:
        log_path = os.path.join(args.log_dir,
                                'dllogger_rank{}.log'.format(get_rank()))
    else:
        log_path = "{}_rank{}".format(args.json_summary, get_rank())

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=1, filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=1, filename=log_path)
        ])

    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = {
        'NVIDIA_TENSORFLOW_VERSION': os.environ.get('NVIDIA_TENSORFLOW_VERSION'),
        'TENSORFLOW_VERSION': os.environ.get('TENSORFLOW_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)
Example #11
if __name__ == "__main__":

    tf.logging.set_verbosity(tf.logging.ERROR)

    FLAGS = parse_cmdline(model_architectures.keys())
    hvd.init()

    if hvd.rank() == 0:
        log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.results_dir, exist_ok=True)

        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])
    dllogger.log(data=vars(FLAGS), step='PARAMETER')

    runner = Runner(
        # ========= Model HParams ========= #
        n_classes=1001,
        architecture=FLAGS.arch,
        input_format='NHWC',
        compute_format=FLAGS.data_format,
        dtype=tf.float32,
        n_channels=3,
        height=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
        width=224 if FLAGS.data_dir else FLAGS.synthetic_data_size,
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default=
        "/workspace/object_detection/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=os.getenv('LOCAL_RANK', 0))
    parser.add_argument("--json-summary",
                        help="Out file for DLLogger",
                        default="dllogger_inference.out",
                        type=str)
    parser.add_argument(
        "--skip-eval",
        dest="skip_eval",
        help="Do not eval the predictions",
        action="store_true",
    )
    parser.add_argument(
        "--fp16",
        help="Mixed precision training",
        action="store_true",
    )
    parser.add_argument(
        "--amp",
        help="Mixed precision training",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    save_dir = ""
    logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank())
    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    save_dir = ""
    dllogger.log(step="PARAMETER", data={"config": cfg})
    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"env_info": collect_env_info()})
    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    # Initialize mixed-precision if necessary
    if args.fp16:
        use_mixed_precision = True
    else:
        use_mixed_precision = cfg.DTYPE == "float16"
    amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)

    output_dir = cfg.OUTPUT_DIR
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
    _ = checkpointer.load(cfg.MODEL.WEIGHT)

    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder
    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)

    results = []
    for output_folder, dataset_name, data_loader_val in zip(
            output_folders, dataset_names, data_loaders_val):
        result = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
            skip_eval=args.skip_eval,
            dllogger=dllogger,
        )
        synchronize()
        results.append(result)

    if is_main_process() and not args.skip_eval:
        map_results, raw_results = results[0]
        bbox_map = map_results.results["bbox"]['AP']
        segm_map = map_results.results["segm"]['AP']
        dllogger.log(step=tuple(),
                     data={
                         "BBOX_mAP": bbox_map,
                         "MASK_mAP": segm_map
                     })
Example #13
def main():

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain .hdf5 files  for the task.")
    parser.add_argument("--config_file",
                        default="bert_config.json",
                        type=str,
                        required=False,
                        help="The BERT model config")
    ckpt_group = parser.add_mutually_exclusive_group(required=True)
    ckpt_group.add_argument("--ckpt_dir",
                            default=None,
                            type=str,
                            help="The ckpt directory, e.g. /results")
    ckpt_group.add_argument("--ckpt_path",
                            default=None,
                            type=str,
                            help="Path to the specific checkpoint")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--eval', dest='do_eval', action='store_true')
    group.add_argument('--prediction', dest='do_eval', action='store_false')
    ## Other parameters
    parser.add_argument(
        "--bert_model",
        default="bert-large-uncased",
        type=str,
        required=False,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--max_predictions_per_seq",
        default=80,
        type=int,
        help="The maximum total of masked tokens in input sequence")
    parser.add_argument("--ckpt_step",
                        default=-1,
                        type=int,
                        required=False,
                        help="The model checkpoint iteration, e.g. 1000")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "Total number of eval  steps to perform, otherwise use full dataset")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--log_path",
                        help="Out file for DLLogger",
                        default="/workspace/dllogger_inference.out",
                        type=str)

    args = parser.parse_args()

    if 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")

    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        assert args.local_rank != -1  # only use torch.distributed for multi-gpu

    dllogger.log(
        step="device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16),
        data={})

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
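        # Padding the vocab to a multiple of 8 keeps the embedding/output GEMM dimensions Tensor Core friendly.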
    model = BertForPreTraining(config)

    if args.ckpt_dir:
        if args.ckpt_step == -1:
            #retrieve latest model
            model_names = [
                f for f in os.listdir(args.ckpt_dir) if f.endswith(".pt")
            ]
            args.ckpt_step = max([
                int(x.split('.pt')[0].split('_')[1].strip())
                for x in model_names
            ])
            dllogger.log(step="load model saved at iteration",
                         data={"number": args.ckpt_step})
        model_file = os.path.join(args.ckpt_dir,
                                  "ckpt_" + str(args.ckpt_step) + ".pt")
    else:
        model_file = args.ckpt_path
    state_dict = torch.load(model_file, map_location="cpu")["model"]
    model.load_state_dict(state_dict, strict=False)

    if args.fp16:
        model.half()  # all parameters and buffers are converted to half precision
    model.to(device)

    multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized()
    if multi_gpu_training:
        model = DDP(model)

    files = [
        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        if os.path.isfile(os.path.join(args.input_dir, f)) and 'test' in f
    ]
    files.sort()

    dllogger.log(step="***** Running Inference *****", data={})
    dllogger.log(step="  Inference batch", data={"size": args.eval_batch_size})

    model.eval()

    nb_instances = 0
    max_steps = args.max_steps if args.max_steps > 0 else np.inf
    global_step = 0
    total_samples = 0

    begin_infer = time.time()
    with torch.no_grad():
        if args.do_eval:
            final_loss = 0.0
            for data_file in files:
                dllogger.log(step="Opening ", data={"file": data_file})
                dataset = pretraining_dataset(
                    input_file=data_file,
                    max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                for step, batch in enumerate(
                        tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels,
                                 next_sentence_label=next_sentence_labels)
                    final_loss += loss.item()

                    global_step += 1

                total_samples += len(datasetloader)
                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            final_loss /= global_step
            if multi_gpu_training:
                final_loss = torch.tensor(final_loss, device=device)
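                # all_reduce defaults to SUM; dividing by world_size below gives the mean loss across ranks.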
                dist.all_reduce(final_loss)
                final_loss /= torch.distributed.get_world_size()
            if (not multi_gpu_training or
                (multi_gpu_training and torch.distributed.get_rank() == 0)):
                dllogger.log(step="Inference Loss",
                             data={"final_loss": final_loss.item()})

        else:  # inference
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            # start_t0 = time.time()
            for data_file in files:
                dllogger.log(step="Opening ", data={"file": data_file})
                dataset = pretraining_dataset(
                    input_file=data_file,
                    max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=args.eval_batch_size,
                                               num_workers=4,
                                               pin_memory=True)

                for step, batch in enumerate(
                        tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch

                    lm_logits, nsp_logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  masked_lm_labels=None,
                                                  next_sentence_label=None)

                    nb_instances += input_ids.size(0)
                    global_step += 1

                total_samples += len(datasetloader)
                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            # if multi_gpu_training:
            #     torch.distributed.barrier()
            if (not multi_gpu_training or
                (multi_gpu_training and torch.distributed.get_rank() == 0)):
                dllogger.log(step="Done Inferring on samples", data={})

    end_infer = time.time()
    dllogger.log(step="Inference perf",
                 data={
                     "inference_sequences_per_second":
                     total_samples * args.eval_batch_size /
                     (end_infer - begin_infer)
                 })
Example #14
def main():

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=os.getenv('LOCAL_RANK', 0))
    parser.add_argument("--max_steps",
                        type=int,
                        default=0,
                        help="Override number of training steps in the config")
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument("--fp16",
                        help="Mixed precision training",
                        action="store_true")
    parser.add_argument("--amp",
                        help="Mixed precision training",
                        action="store_true")
    parser.add_argument('--skip_checkpoint',
                        default=False,
                        action='store_true',
                        help="Whether to save checkpoints")
    parser.add_argument(
        "--json-summary",
        help="Out file for DLLogger",
        default="dllogger.out",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # Redundant option - Override config parameter with command line input
    if args.max_steps > 0:
        cfg.SOLVER.MAX_ITER = args.max_steps

    if args.skip_checkpoint:
        cfg.SAVE_CHECKPOINT = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()})
    dllogger.log(step="PARAMETER", data={"config_file": args.config_file})

    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()

    dllogger.log(step="PARAMETER", data={"config": cfg})

    if args.fp16:
        fp16 = True
    else:
        fp16 = False

    model, iters_per_epoch = train(cfg, args.local_rank, args.distributed,
                                   fp16, dllogger)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args.distributed, iters_per_epoch, dllogger)
def main(args):
    args.fp16 = args.fp16 or args.amp
    if args.server_ip and args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logger.info("Waiting for debugger attach")
        ptvsd.enable_attach(
            address=(args.server_ip, args.server_port),
            redirect_output=True,
        )
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, "
                "16-bits training: {}".format(
                    device,
                    n_gpu,
                    bool(args.local_rank != -1),
                    args.fp16,
                ))

    if not args.do_train and not args.do_eval and not args.do_predict:
        raise ValueError("At least one of `do_train`, `do_eval` or "
                         "`do_predict` must be True.")

    if is_main_process():
        if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
                and args.do_train):
            logger.warning("Output directory ({}) already exists and is not "
                           "empty.".format(args.output_dir))
    mkdir_by_main_process(args.output_dir)

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(
                verbosity=dllogger.Verbosity.VERBOSE,
                filename=os.path.join(args.output_dir, 'dllogger.json'),
            ),
            dllogger.StdOutBackend(
                verbosity=dllogger.Verbosity.VERBOSE,
                step_format=format_step,
            ),
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(step="PARAMETER", data={"Config": [str(args)]})

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             args.gradient_accumulation_steps))
    if args.gradient_accumulation_steps > args.train_batch_size:
        raise ValueError("gradient_accumulation_steps ({}) cannot be larger "
                         "train_batch_size ({}) - there cannot be a fraction "
                         "of one sample.".format(
                             args.gradient_accumulation_steps,
                             args.train_batch_size,
                         ))
    args.train_batch_size = (args.train_batch_size //
                             args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    dllogger.log(step="PARAMETER", data={"SEED": args.seed})

    processor = PROCESSORS[args.task_name]()
    num_labels = len(processor.get_labels())

    #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer(
        args.vocab_file,
        do_lower_case=args.do_lower_case,
        max_len=512,
    )  # for bert large

    num_train_optimization_steps = None
    if args.do_train:
        train_features = get_train_features(
            args.data_dir,
            args.bert_model,
            args.max_seq_length,
            args.do_lower_case,
            args.local_rank,
            args.train_batch_size,
            args.gradient_accumulation_steps,
            args.num_train_epochs,
            tokenizer,
            processor,
        )
        num_train_optimization_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (num_train_optimization_steps //
                                            torch.distributed.get_world_size())

    # Prepare model
    config = modeling.BertConfig.from_json_file(args.config_file)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
    model = modeling.BertForSequenceClassification(
        config,
        num_labels=num_labels,
    )
    logger.info("USING CHECKPOINT from {}".format(args.init_checkpoint))

    checkpoint = torch.load(args.init_checkpoint, map_location='cpu')
    checkpoint = checkpoint["model"] if "model" in checkpoint.keys(
    ) else checkpoint
    model.load_state_dict(checkpoint, strict=False)
    logger.info("USED CHECKPOINT from {}".format(args.init_checkpoint))
    dllogger.log(
        step="PARAMETER",
        data={
            "num_parameters":
            sum([p.numel() for p in model.parameters() if p.requires_grad]),
        },
    )

    model.to(device)
    # Prepare optimizer
    model, optimizer, scheduler = init_optimizer_and_amp(
        model,
        args.learning_rate,
        args.loss_scale,
        args.warmup_proportion,
        num_train_optimization_steps,
        args.fp16,
    )

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use "
                              "distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    loss_fct = torch.nn.CrossEntropyLoss()

    results = {}
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        train_data = gen_tensor_dataset(train_features)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(
            train_data,
            sampler=train_sampler,
            batch_size=args.train_batch_size,
        )

        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        latency_train = 0.0
        nb_tr_examples = 0
        model.train()
        tic_train = time.perf_counter()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if args.max_steps > 0 and global_step > args.max_steps:
                    break
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask)
                loss = loss_fct(
                    logits.view(-1, num_labels),
                    label_ids.view(-1),
                )
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up for BERT
                        # which FusedAdam doesn't do
                        scheduler.step()

                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
        latency_train = time.perf_counter() - tic_train
        tr_loss = tr_loss / nb_tr_steps
        results.update({
            'global_step': global_step,
            'train:loss': tr_loss,
            'train:latency': latency_train,
            'train:num_samples_per_gpu': nb_tr_examples,
            'train:num_steps': nb_tr_steps,
            'train:throughput': get_world_size() * nb_tr_examples / latency_train,
        })
        if is_main_process() and not args.skip_checkpoint:
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(
                {"model": model_to_save.state_dict()},
                os.path.join(args.output_dir, modeling.WEIGHTS_NAME),
            )
            with open(
                    os.path.join(args.output_dir, modeling.CONFIG_NAME),
                    'w',
            ) as f:
                f.write(model_to_save.config.to_json_string())

    if (args.do_eval or args.do_predict) and is_main_process():
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features, label_map = convert_examples_to_features(
            eval_examples,
            processor.get_labels(),
            args.max_seq_length,
            tokenizer,
        )
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_data = gen_tensor_dataset(eval_features)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(
            eval_data,
            sampler=eval_sampler,
            batch_size=args.eval_batch_size,
        )

        model.eval()
        preds = None
        out_label_ids = None
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        cuda_events = [(torch.cuda.Event(enable_timing=True),
                        torch.cuda.Event(enable_timing=True))
                       for _ in range(len(eval_dataloader))]
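        # Each (start, end) CUDA event pair times one eval batch on the GPU; elapsed_time() is read in milliseconds after the final synchronize.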
        for i, (input_ids, input_mask, segment_ids, label_ids) in tqdm(
                enumerate(eval_dataloader),
                desc="Evaluating",
        ):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                cuda_events[i][0].record()
                logits = model(input_ids, segment_ids, input_mask)
                cuda_events[i][1].record()
                if args.do_eval:
                    eval_loss += loss_fct(
                        logits.view(-1, num_labels),
                        label_ids.view(-1),
                    ).mean().item()

            nb_eval_steps += 1
            nb_eval_examples += input_ids.size(0)
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    label_ids.detach().cpu().numpy(),
                    axis=0,
                )
        torch.cuda.synchronize()
        eval_latencies = [
            event_start.elapsed_time(event_end)
            for event_start, event_end in cuda_events
        ]
        eval_latencies = list(sorted(eval_latencies))

        def infer_latency_sli(threshold):
            index = int(len(eval_latencies) * threshold) - 1
            index = min(max(index, 0), len(eval_latencies) - 1)
            return eval_latencies[index]

        eval_throughput = (args.eval_batch_size /
                           (np.mean(eval_latencies) / 1000))

        results.update({
            'eval:num_samples_per_gpu': nb_eval_examples,
            'eval:num_steps': nb_eval_steps,
            'infer:latency(ms):50%': infer_latency_sli(0.5),
            'infer:latency(ms):90%': infer_latency_sli(0.9),
            'infer:latency(ms):95%': infer_latency_sli(0.95),
            'infer:latency(ms):99%': infer_latency_sli(0.99),
            'infer:latency(ms):100%': infer_latency_sli(1.0),
            'infer:latency(ms):avg': np.mean(eval_latencies),
            'infer:latency(ms):std': np.std(eval_latencies),
            'infer:latency(ms):sum': np.sum(eval_latencies),
            'infer:throughput(samples/s):avg': eval_throughput,
        })
        preds = np.argmax(preds, axis=1)
        if args.do_predict:
            dump_predictions(
                os.path.join(args.output_dir, 'predictions.json'),
                label_map,
                preds,
                eval_examples,
            )
        if args.do_eval:
            results['eval:loss'] = eval_loss / nb_eval_steps
            eval_result = compute_metrics(args.task_name, preds, out_label_ids)
            results.update(eval_result)

    if is_main_process():
        logger.info("***** Results *****")
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        with open(os.path.join(args.output_dir, "results.txt"), "w") as writer:
            json.dump(results, writer)
        dllogger_queries_from_results = {
            'exact_match': 'acc',
            'F1': 'f1',
            'e2e_train_time': 'train:latency',
            'training_sequences_per_second': 'train:throughput',
            'e2e_inference_time':
            ('infer:latency(ms):sum', lambda x: x / 1000),
            'inference_sequences_per_second':
            'infer:throughput(samples/s):avg',
        }
        for key, query in dllogger_queries_from_results.items():
            results_key, convert = (query if isinstance(query, tuple) else
                                    (query, lambda x: x))
            if results_key not in results:
                continue
            dllogger.log(
                step=tuple(),
                data={key: convert(results[results_key])},
            )
    dllogger.flush()
    return results
def main():
    args = parse_args()

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    batch_sizes = args.batch_sizes.split(',')
    batch_sizes = [int(s) for s in batch_sizes]
    result_data = {}

    for batch_size in batch_sizes:
        print('Benchmarking batch size', batch_size)
        tf.reset_default_graph()

        # Input tensors
        users = tf.placeholder(tf.int32, shape=(None, ))
        items = tf.placeholder(tf.int32, shape=(None, ))
        dropout = tf.placeholder_with_default(0.0, shape=())

        # Model ops and saver
        logits_op = ncf_model_ops(users=users,
                                  items=items,
                                  labels=None,
                                  dup_mask=None,
                                  mode='INFERENCE',
                                  params={
                                      'fp16': False,
                                      'val_batch_size': batch_size,
                                      'num_users': args.n_users,
                                      'num_items': args.n_items,
                                      'num_factors': args.factors,
                                      'mf_reg': 0,
                                      'layer_sizes': args.layers,
                                      'layer_regs': [0. for i in args.layers],
                                      'dropout': 0.0,
                                      'sigmoid': True,
                                      'top_k': None,
                                      'learning_rate': None,
                                      'beta_1': None,
                                      'beta_2': None,
                                      'epsilon': None,
                                      'loss_scale': None,
                                  })

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
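        # allow_growth keeps TF from reserving all GPU memory at session creation.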
        if args.xla:
            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

        sess = tf.Session(config=config)

        saver = tf.train.Saver()
        if args.load_checkpoint_path:
            saver.restore(sess, args.load_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        sess.run(tf.local_variables_initializer())

        users_batch = np.random.randint(size=batch_size,
                                        low=0,
                                        high=args.n_users)
        items_batch = np.random.randint(size=batch_size,
                                        low=0,
                                        high=args.n_items)

        latencies = []
        for i in range(args.num_batches):
            start = time.time()
            _ = sess.run(logits_op,
                         feed_dict={
                             users: users_batch,
                             items: items_batch,
                             dropout: 0.0
                         })
            end = time.time()

            if i < 10:  # warmup iterations
                continue

            latencies.append(end - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()
Example #17
def main():
    args = parse_args()
    print("init distributed")
    init_distributed(args)
    if args.rank == 0:
        wandb.init()
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    torch.manual_seed(1)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(
                                   args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(
                                  args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(
                               args.local_rank)))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout).cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 betas=(args.beta1, args.beta2),
                                 eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none').cuda()  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU

    if args.distributed:
        model = DDP(model, device_ids=[args.local_rank])

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))
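    # Tracing the loss with dummy (local_batch, 1) tensors compiles it to TorchScript; the real batches are reshaped to match below.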

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                loss.backward()
            optimizer.step()
            if args.rank == 0:
                wandb.log({"Test loss": loss})
            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_users,
                             test_items,
                             dup_mask,
                             real_indices,
                             args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })
        if args.rank == 0:
            wandb.log({"Test hit rate": hr})
            wandb.log({"Test train epoch time": train_time})
            wandb.log({"Test train throughput": train_throughput})
        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            #            save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
            print("New best hr!")
            #            torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(data={
            'best_train_throughput':
            max(train_throughputs),
            'best_eval_throughput':
            max(eval_throughputs),
            'mean_train_throughput':
            np.mean(train_throughputs),
            'mean_eval_throughput':
            np.mean(eval_throughputs),
            'best_accuracy':
            max_hr,
            'best_epoch':
            best_epoch,
            'time_to_target':
            time.time() - main_start_time,
            'time_to_best_model':
            best_model_timestamp - main_start_time
        },
                     step=tuple())
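# The training loop above accumulates gradients over args.grads_accumulated
# micro-batches before a single optimizer.step(), then clears gradients by
# assigning p.grad = None. A minimal sketch of that pattern with a toy model
# (note that, as in the example, the per-micro-batch losses are not rescaled
# by the accumulation factor):
import torch
import torch.nn as nn

model = nn.Linear(8, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
grads_accumulated = 4

for step in range(2):
    for _ in range(grads_accumulated):
        x = torch.randn(16, 8)
        y = torch.randint(0, 2, (16, 1)).float()
        loss = criterion(model(x), y)
        loss.backward()              # gradients sum up across micro-batches
    optimizer.step()
    for p in model.parameters():
        p.grad = None                # cheaper than zero_grad(); matches the example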
Example #18
        checkpoints = list(
            sorted(
                glob.glob(os.path.join(args.output_dir, "*.ckpt"),
                          recursive=True)))
        if checkpoints:
            model.hparams.test_checkpoint = checkpoints[-1]
            trainer.resume_from_checkpoint = checkpoints[-1]
        trainer.logger.log_hyperparams(model.hparams)
        trainer.test()
    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())

    args = parser.parse_args()

    if get_rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    main(args)

    dllogger.flush()
Example #19
def init_dllogger(log_dir):
    Logger.init([
        Logger.StdOutBackend(Logger.Verbosity.DEFAULT, step_format=format_step),
        Logger.JSONStreamBackend(Logger.Verbosity.VERBOSE, log_dir)
    ])
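# A usage sketch for the dllogger pattern repeated in these examples: rank 0
# initializes real backends, other ranks pass an empty backend list so their
# log calls become no-ops. The file name and metric values below are
# placeholders, not taken from any example above.
import dllogger

dllogger.init(backends=[
    dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                               filename='log.json'),
    dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
])
dllogger.log(step='PARAMETER', data={'batch_size': 1024})    # run configuration
dllogger.log(step=(0, ), data={'train_throughput': 1.0e6})   # per-epoch metrics
dllogger.log(step=tuple(), data={'best_accuracy': 0.96})     # run-level summary
dllogger.flush()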
Example #20
def run_generate(verbose=True):
    """

    Takes input text, generates output, and then calculates BLEU (or ROUGE) scores against the reference.

    The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed.

    Args:
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout

    Returns:
        a tuple: ``(scores, params)``
        - ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``
        - ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("model_path",
                        type=str,
                        help="like facebook/bart-large-cnn or path to ckpt")
    parser.add_argument("config_path", type=str, help="path to config")
    parser.add_argument("data_dir", type=str, help="like cnn_dm/test.source")
    parser.add_argument("save_path", type=str, help="where to save summaries")
    parser.add_argument("--type_path",
                        type=str,
                        required=False,
                        default="test",
                        help="like cnn_dm/test.target")
    parser.add_argument("--device",
                        type=str,
                        required=False,
                        default=DEFAULT_DEVICE,
                        help="cuda, cuda:1, cpu etc.")
    parser.add_argument("--prefix",
                        type=str,
                        required=False,
                        default=None,
                        help="will be added to the begininng of src examples")
    parser.add_argument("--task",
                        type=str,
                        default="summarization",
                        help="used for task_specific_params + metrics")
    parser.add_argument("--bs",
                        type=int,
                        default=8,
                        required=False,
                        help="batch size")
    parser.add_argument("--n_obs",
                        type=int,
                        default=None,
                        required=False,
                        help="How many observations. Defaults to all.")
    parser.add_argument("--num_return_sequences",
                        type=int,
                        default=1,
                        required=False,
                        help="How many sequences to return")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--dump-args",
                        action="store_true",
                        help="print the custom hparams with the results")
    parser.add_argument(
        "--info",
        nargs="?",
        type=str,
        const=datetime_now(),
        help=
        "use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.",
    )
    parser.add_argument("--eval_max_gen_length",
                        type=int,
                        default=None,
                        help="never generate more than n tokens")
    parser.add_argument(
        "--eval_beams",
        type=int,
        default=None,
        required=False,
        help="# beams to use. 0 corresponds to not using beam search.")
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=142,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--sync_timeout",
        type=int,
        default=600,
        required=False,
        help=
        "How long should master process wait for other processes to finish.",
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument('--json-summary',
                        type=str,
                        default="results/dllogger.json",
                        help='If provided, the json summary will be written to '
                        'the specified file.')
    parser.add_argument(
        '--distill',
        type=str,
        default=None,
        help="string indicating how model is distilled, only sft supported",
        choices=["sft", None])
    parser.add_argument(
        '--layers',
        type=str,
        default=None,
        help=
        "string indicating which teacher layers remain, split by '-' (ex. 0-6-11)"
    )
    parser.add_argument('--do_encoder',
                        action="store_true",
                        default=False,
                        help="if true encoder distilled")
    parser.add_argument('--do_decoder',
                        action="store_true",
                        default=False,
                        help="if true decoder distilled")

    dist = parser.add_argument_group('distributed setup')
    dist.add_argument('--local_rank',
                      type=int,
                      default=os.getenv('LOCAL_RANK', 0),
                      help='Used for multi-process training.')

    start_time = time.time()

    # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
    args, rest = parser.parse_known_args()
    parsed_args = parse_numeric_n_bool_cl_kwargs(rest)

    if args.local_rank <= 0:
        print(args)
        print(rest)

    # Initialize device and distributed backend
    utils.distributed_utils.init_distributed(args.device == "cuda")
    if utils.distributed_utils.get_world_size() > 1:
        utils.distributed_utils.set_affinity(args.local_rank)
        torch.cuda.set_device(args.local_rank)

    if Path(args.json_summary).exists():
        warnings.warn(
            f"json_summary {args.json_summary} will be overwritten unless you type ctrl-c."
        )

    if utils.distributed_utils.get_rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    if parsed_args and verbose:
        print(f"parsed the following generate kwargs: {parsed_args}")

    Path(args.save_path).parent.mkdir(exist_ok=True)
    json_save_path = Path(args.save_path + "/tmp")
    Path(json_save_path).mkdir(exist_ok=True)  # this handles locking.

    if args.layers:
        num_layers = len(args.layers.split('-'))
    else:
        num_layers = None

    results, num_replicas, runtime_metrics = generate_summaries_or_translations(
        args.data_dir,
        json_save_path,
        args.model_path,
        args.config_path,
        batch_size=args.bs,
        device=args.device,
        fp16=args.fp16,
        task=args.task,
        prefix=args.prefix,
        eval_beams=args.eval_beams,
        max_source_length=args.max_source_length,
        max_target_length=args.max_target_length,
        eval_max_gen_length=args.eval_max_gen_length,
        n_obs=args.n_obs,
        type_path=args.type_path,
        num_return_sequences=args.num_return_sequences,
        distill=args.distill,
        num_layers=num_layers,
        do_encoder=args.do_encoder,
        do_decoder=args.do_decoder,
        **parsed_args,
    )

    if args.local_rank <= 0:
        save_path = Path(args.save_path)
        save_path.mkdir(exist_ok=True)
        partial_results = gather_results_from_each_node(
            num_replicas, json_save_path, args.sync_timeout)
        preds, time_list = combine_partial_results(partial_results)
        if args.num_return_sequences > 1:
            save_path = save_path.joinpath("pseudolabel_results.json")
            print(
                f"Saving aggregated results at {save_path}, intermediate in {json_save_path}/"
            )
            save_json(preds, save_path)
            return
        tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
        labels = [x.rstrip() for x in open(tgt_file).readlines()][:len(preds)]

        # Calculate metrics, save metrics,  and save _generations.txt
        calc_bleu = "translation" in args.task
        score_fn = calculate_bleu if calc_bleu else calculate_rouge
        metric_name = "bleu" if calc_bleu else "rouge"
        metrics: Dict = score_fn(preds, labels)
        metrics["n_obs"] = len(preds)
        runtime = time.time() - start_time
        metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
        metrics["n_gpus"] = num_replicas
        metrics.update(runtime_metrics)

        time_list.sort()
        metrics["inference_latency_mean"] = np.mean(time_list)
        metrics["inference_latency_conf_50"] = max(
            time_list[:int(len(time_list) * 0.50)])
        metrics["inference_latency_conf_90"] = max(
            time_list[:int(len(time_list) * 0.90)])
        metrics["inference_latency_conf_95"] = max(
            time_list[:int(len(time_list) * 0.95)])
        metrics["inference_latency_conf_99"] = max(
            time_list[:int(len(time_list) * 0.99)])
        metrics["inference_latency_conf_100"] = max(
            time_list[:int(len(time_list) * 1)])
        metrics["inference_throughput_mean"] = len(preds) * 1.0 / sum(
            time_list)

        metrics_save_path = save_path.joinpath(
            f"{args.type_path}_{metric_name}.json")
        save_json(metrics, metrics_save_path, indent=None)
        dllogger.log(step=tuple(), data=metrics)
        print(metrics)
        write_txt_file(preds,
                       save_path.joinpath(f"{args.type_path}_generations.txt"))
        if args.debug:
            write_txt_file(labels,
                           save_path.joinpath(f"{args.type_path}.target"))
        else:
            shutil.rmtree(json_save_path)

    dllogger.flush()
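# Sketch of the parse_known_args pattern used in run_generate above: flags the
# parser does not define (e.g. --num_beams=2) end up in `rest` and are turned
# into keyword arguments for model.generate. to_generate_kwargs below is a
# simplified stand-in for parse_numeric_n_bool_cl_kwargs, not its actual code.
import argparse

def to_generate_kwargs(unknown_args):
    kwargs = {}
    for token in unknown_args:
        key, _, value = token.lstrip("-").partition("=")
        if value.lower() in ("true", "false"):
            kwargs[key] = value.lower() == "true"
        elif value.lstrip("-").isdigit():
            kwargs[key] = int(value)
        else:
            kwargs[key] = value
    return kwargs

parser = argparse.ArgumentParser()
parser.add_argument("--bs", type=int, default=8)
args, rest = parser.parse_known_args(["--bs", "4", "--num_beams=2"])
print(args.bs, to_generate_kwargs(rest))  # 4 {'num_beams': 2}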
Example #21
def setup_training(args):

    #assert (torch.cuda.is_available())
    if args.use_habana:
        sys.path.append(
            os.path.realpath(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "../../../common")))
        from library_loader import load_habana_module
        load_habana_module()
        device = torch.device("habana")

        if args.hmp:
            print(args.hmp_bf16)
            from hmp import hmp
            hmp.convert(opt_level=args.hmp_opt_level,
                        bf16_file_path=args.hmp_bf16,
                        fp32_file_path=args.hmp_fp32,
                        isVerbose=args.hmp_verbose)

        if args.use_jit_trace:
            enable_tracing()

        args.n_pu = 1
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
        if args.local_rank != -1:
            if os.getenv('HCL_CONFIG_PATH') is None:
                print("HCL_CONFIG_PATH is not set")
                exit(1)
            os.environ["ID"] = str(args.local_rank)
            args.world_size = int(os.environ["WORLD_SIZE"])
            args.rank = int(os.environ["RANK"])
            torch.distributed.init_process_group('hcl',
                                                 rank=args.rank,
                                                 world_size=args.world_size)

    elif args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        if device == torch.device("cuda"):
            args.n_pu = torch.cuda.device_count()
        else:
            args.n_pu = 1

        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.n_pu = 1

    if args.gradient_accumulation_steps == 1:
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print(
        "device: {} n_pu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_pu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train`  must be True.")

    if not args.resume_from_checkpoint and os.path.exists(
            args.output_dir) and (os.listdir(args.output_dir) and any(
                [i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def main():
    script_start = time.time()
    hvd_init()
    mpi_comm = MPI.COMM_WORLD
    args = parse_args()

    if hvd.rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        tf.random.set_random_seed(args.seed)
        np.random.seed(args.seed)
        cp.random.seed(args.seed)

    if args.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \
       and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1":
        args.fp16 = False

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)
    final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt')

    # Load converted data and get statistics
    train_df = pd.read_pickle(args.data + '/train_ratings.pickle')
    test_df = pd.read_pickle(args.data + '/test_ratings.pickle')
    nb_users, nb_items = train_df.max() + 1

    # Extract train and test feature tensors from dataframe
    pos_train_users = train_df.iloc[:, 0].values.astype(np.int32)
    pos_train_items = train_df.iloc[:, 1].values.astype(np.int32)
    pos_test_users = test_df.iloc[:, 0].values.astype(np.int32)
    pos_test_items = test_df.iloc[:, 1].values.astype(np.int32)
    # Negatives indicator for negatives generation
    neg_mat = np.ones((nb_users, nb_items), dtype=bool)  # np.bool alias removed in newer NumPy
    neg_mat[pos_train_users, pos_train_items] = 0

    # Get the local training/test data
    train_users, train_items, train_labels = get_local_train_data(
        pos_train_users, pos_train_items, args.negative_samples)
    test_users, test_items = get_local_test_data(pos_test_users,
                                                 pos_test_items)

    # Create and run Data Generator in a separate thread
    data_generator = DataGenerator(
        args.seed,
        hvd.rank(),
        nb_users,
        nb_items,
        neg_mat,
        train_users,
        train_items,
        train_labels,
        args.batch_size // hvd.size(),
        args.negative_samples,
        test_users,
        test_items,
        args.valid_users_per_batch,
        args.valid_negative,
    )

    # Create tensorflow session and saver
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if args.xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    sess = tf.Session(config=config)

    # Input tensors
    users = tf.placeholder(tf.int32, shape=(None, ))
    items = tf.placeholder(tf.int32, shape=(None, ))
    labels = tf.placeholder(tf.int32, shape=(None, ))
    is_dup = tf.placeholder(tf.float32, shape=(None, ))
    dropout = tf.placeholder_with_default(args.dropout, shape=())
    # Model ops and saver
    hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
        users,
        items,
        labels,
        is_dup,
        params={
            'fp16': args.fp16,
            'val_batch_size': args.valid_negative + 1,
            'top_k': args.topk,
            'learning_rate': args.learning_rate,
            'beta_1': args.beta1,
            'beta_2': args.beta2,
            'epsilon': args.eps,
            'num_users': nb_users,
            'num_items': nb_items,
            'num_factors': args.factors,
            'mf_reg': 0,
            'layer_sizes': args.layers,
            'layer_regs': [0. for i in args.layers],
            'dropout': dropout,
            'sigmoid': True,
            'loss_scale': args.loss_scale
        },
        mode='TRAIN' if args.mode == 'train' else 'EVAL')
    saver = tf.train.Saver()

    # Accuracy metric tensors
    hr_sum = tf.get_default_graph().get_tensor_by_name(
        'neumf/hit_rate/total:0')
    hr_cnt = tf.get_default_graph().get_tensor_by_name(
        'neumf/hit_rate/count:0')
    ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0')
    ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0')

    # Prepare evaluation data
    data_generator.prepare_eval_data()

    if args.load_checkpoint_path:
        saver.restore(sess, args.load_checkpoint_path)
    else:
        # Manual initialize weights
        sess.run(tf.global_variables_initializer())

    # If test mode, run one eval
    if args.mode == 'test':
        sess.run(tf.local_variables_initializer())
        eval_start = time.time()
        for user_batch, item_batch, dup_batch \
            in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask):
            sess.run(eval_op,
                     feed_dict={
                         users: user_batch,
                         items: item_batch,
                         is_dup: dup_batch,
                         dropout: 0.0
                     })
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            eval_throughput = pos_test_users.shape[0] * (args.valid_negative +
                                                         1) / eval_duration
            dllogger.log(step=tuple(),
                         data={
                             'eval_throughput': eval_throughput,
                             'eval_time': eval_duration,
                             'hr@10': hit_rate,
                             'ndcg': ndcg
                         })
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
    # Accuracy Metrics
    first_to_target = None
    time_to_train = 0.0
    best_hr = 0
    best_epoch = 0
    # Buffers for global metrics
    global_hr_sum = np.ones(1)
    global_hr_count = np.ones(1)
    global_ndcg_sum = np.ones(1)
    global_ndcg_count = np.ones(1)
    # Buffers for local metrics
    local_hr_sum = np.ones(1)
    local_hr_count = np.ones(1)
    local_ndcg_sum = np.ones(1)
    local_ndcg_count = np.ones(1)

    # Begin training
    begin_train = time.time()
    for epoch in range(args.epochs):
        # Train for one epoch
        train_start = time.time()
        data_generator.prepare_train_data()
        for user_batch, item_batch, label_batch \
            in zip(data_generator.train_users_batches,
                   data_generator.train_items_batches,
                   data_generator.train_labels_batches):
            sess.run(train_op,
                     feed_dict={
                         users: user_batch.get(),
                         items: item_batch.get(),
                         labels: label_batch.get()
                     })
        train_duration = time.time() - train_start
        # Only log "warm" epochs
        if epoch >= 1:
            train_times.append(train_duration)
        # Evaluate
        if epoch > args.eval_after:
            eval_start = time.time()
            sess.run(tf.local_variables_initializer())
            for user_batch, item_batch, dup_batch \
                in zip(data_generator.eval_users,
                       data_generator.eval_items,
                       data_generator.dup_mask):
                sess.run(eval_op,
                         feed_dict={
                             users: user_batch,
                             items: item_batch,
                             is_dup: dup_batch,
                             dropout: 0.0
                         })
            # Compute local metrics
            local_hr_sum[0] = sess.run(hr_sum)
            local_hr_count[0] = sess.run(hr_cnt)
            local_ndcg_sum[0] = sess.run(ndcg_sum)
            local_ndcg_count[0] = sess.run(ndcg_cnt)
            # Reduce metrics across all workers
            mpi_comm.Reduce(local_hr_count, global_hr_count)
            mpi_comm.Reduce(local_hr_sum, global_hr_sum)
            mpi_comm.Reduce(local_ndcg_count, global_ndcg_count)
            mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum)
            # Calculate metrics
            hit_rate = global_hr_sum[0] / global_hr_count[0]
            ndcg = global_ndcg_sum[0] / global_ndcg_count[0]

            eval_duration = time.time() - eval_start
            # Only log "warm" epochs
            if epoch >= 1:
                eval_times.append(eval_duration)

            if hvd.rank() == 0:
                dllogger.log(step=(epoch, ),
                             data={
                                 'train_time': train_duration,
                                 'eval_time': eval_duration,
                                 'hr@10': hit_rate,
                                 'ndcg': ndcg
                             })

                # Update summary metrics
                if hit_rate > args.target and first_to_target is None:
                    first_to_target = epoch
                    time_to_train = time.time() - begin_train
                if hit_rate > best_hr:
                    best_hr = hit_rate
                    best_epoch = epoch
                    time_to_best = time.time() - begin_train
                    if hit_rate > args.target:
                        saver.save(sess, final_checkpoint_path)

    # Final Summary
    if hvd.rank() == 0:
        train_times = np.array(train_times)
        train_throughputs = pos_train_users.shape[0] * (args.negative_samples +
                                                        1) / train_times
        eval_times = np.array(eval_times)
        eval_throughputs = pos_test_users.shape[0] * (args.valid_negative +
                                                      1) / eval_times

        dllogger.log(step=tuple(),
                     data={
                         'average_train_time_per_epoch': np.mean(train_times),
                         'average_train_throughput':
                         np.mean(train_throughputs),
                         'average_eval_time_per_epoch': np.mean(eval_times),
                         'average_eval_throughput': np.mean(eval_throughputs),
                         'first_epoch_to_hit': first_to_target,
                         'time_to_train': time_to_train,
                         'time_to_best': time_to_best,
                         'best_hr': best_hr,
                         'best_epoch': best_epoch
                     })
        dllogger.flush()

    sess.close()
    return
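# Sketch of the cross-worker metric aggregation above: every worker holds local
# hit-rate sum/count buffers and MPI Reduce (default op SUM, root 0) combines
# them before rank 0 computes the global ratio. The numbers are placeholders;
# run under mpirun to see more than one worker contribute.
import numpy as np
from mpi4py import MPI

mpi_comm = MPI.COMM_WORLD
local_hr_sum = np.array([123.0])      # placeholder local numerator
local_hr_count = np.array([1000.0])   # placeholder local denominator
global_hr_sum = np.zeros(1)
global_hr_count = np.zeros(1)

mpi_comm.Reduce(local_hr_sum, global_hr_sum)      # summed onto rank 0
mpi_comm.Reduce(local_hr_count, global_hr_count)
if mpi_comm.rank == 0:
    print("hr@10:", global_hr_sum[0] / global_hr_count[0])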
def main(args, model_args):
    exp_start_time = time.time()
    global best_prec1
    best_prec1 = 0

    args.distributed = False
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
        args.local_rank = int(os.environ["LOCAL_RANK"])
    else:
        args.local_rank = 0

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend="nccl", init_method="env://")
        args.world_size = torch.distributed.get_world_size()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)

    else:

        def _worker_init_fn(id):
            pass

    if args.static_loss_scale != 1.0:
        if not args.amp:
            print(
                "Warning: if --amp is not used, static_loss_scale will be ignored."
            )

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}"
                .format(args.optimizer_batch_size, tbs))
        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            start_epoch = checkpoint["epoch"]
            best_prec1 = checkpoint["best_prec1"]
            model_state = checkpoint["state_dict"]
            optimizer_state = checkpoint["optimizer"]
            if "state_dict_ema" in checkpoint:
                model_state_ema = checkpoint["state_dict_ema"]
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint["epoch"]))
            if start_epoch >= args.epochs:
                print(
                    f"Launched training for {args.epochs}, checkpoint already run {start_epoch}"
                )
                exit(1)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            model_state_ema = None
            optimizer_state = None
    else:
        model_state = None
        model_state_ema = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    memory_format = (torch.channels_last if args.memory_format == "nhwc" else
                     torch.contiguous_format)
    model = available_models()[args.arch](**{
        k: v if k != "pretrained" else v and (
            not args.distributed or dist.get_rank() == 0)
        for k, v in model_args.__dict__.items()
    })

    image_size = (args.image_size if args.image_size is not None else
                  model.arch.default_image_size)
    model_and_loss = ModelAndLoss(model,
                                  loss,
                                  cuda=True,
                                  memory_format=memory_format)
    if args.use_ema is not None:
        model_ema = deepcopy(model_and_loss)
        ema = EMA(args.use_ema)
    else:
        model_ema = None
        ema = None

    # Create data loaders and optimizers as needed
    if args.data_backend == "pytorch":
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == "dali-gpu":
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "dali-cpu":
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "syntetic":
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader
    else:
        print("Bad databackend picked")
        exit(1)

    train_loader, train_loader_len = get_train_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        args.mixup > 0.0,
        interpolation=args.interpolation,
        augmentation=args.augmentation,
        start_epoch=start_epoch,
        workers=args.workers,
        memory_format=memory_format,
    )
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, train_loader)

    val_loader, val_loader_len = get_val_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        False,
        interpolation=args.interpolation,
        workers=args.workers,
        memory_format=memory_format,
    )

    if not torch.distributed.is_initialized() or torch.distributed.get_rank(
    ) == 0:
        logger = log.Logger(
            args.print_freq,
            [
                dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                       step_format=log.format_step),
                dllogger.JSONStreamBackend(
                    dllogger.Verbosity.VERBOSE,
                    os.path.join(args.workspace, args.raport_file),
                ),
            ],
            start_epoch=start_epoch - 1,
        )

    else:
        logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1)

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
    logger.log_parameter(
        {f"model.{k}": v
         for k, v in model_args.__dict__.items()},
        verbosity=dllogger.Verbosity.DEFAULT,
    )

    optimizer = get_optimizer(
        list(model_and_loss.model.named_parameters()),
        args.lr,
        args=args,
        state=optimizer_state,
    )

    if args.lr_schedule == "step":
        lr_policy = lr_step_policy(args.lr, [30, 60, 80],
                                   0.1,
                                   args.warmup,
                                   logger=logger)
    elif args.lr_schedule == "cosine":
        lr_policy = lr_cosine_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     end_lr=args.end_lr,
                                     logger=logger)
    elif args.lr_schedule == "linear":
        lr_policy = lr_linear_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)

    scaler = torch.cuda.amp.GradScaler(
        init_scale=args.static_loss_scale,
        growth_factor=2,
        backoff_factor=0.5,
        growth_interval=100 if args.dynamic_loss_scale else 1000000000,
        enabled=args.amp,
    )

    if args.distributed:
        model_and_loss.distributed(args.gpu)

    model_and_loss.load_model_state(model_state)
    if (ema is not None) and (model_state_ema is not None):
        print("load ema")
        ema.load_state_dict(model_state_ema)

    train_loop(
        model_and_loss,
        optimizer,
        scaler,
        lr_policy,
        train_loader,
        val_loader,
        logger,
        should_backup_checkpoint(args),
        ema=ema,
        model_ema=model_ema,
        steps_per_epoch=train_loader_len,
        use_amp=args.amp,
        batch_size_multiplier=batch_size_multiplier,
        start_epoch=start_epoch,
        end_epoch=min((start_epoch + args.run_epochs), args.epochs)
        if args.run_epochs != -1 else args.epochs,
        early_stopping_patience=args.early_stopping_patience,
        best_prec1=best_prec1,
        prof=args.prof,
        skip_training=args.evaluate,
        skip_validation=args.training_only,
        save_checkpoints=args.save_checkpoints and not args.evaluate,
        checkpoint_dir=args.workspace,
        checkpoint_filename=args.checkpoint_filename,
    )
    exp_duration = time.time() - exp_start_time
    if not torch.distributed.is_initialized() or torch.distributed.get_rank(
    ) == 0:
        logger.end()
    print("Experiment ended")
Example #24
def prepare_for_training(args, model_args, model_arch):
    args.distributed = False
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
        args.local_rank = int(os.environ["LOCAL_RANK"])
    else:
        args.local_rank = 0

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend="nccl", init_method="env://")
        args.world_size = torch.distributed.get_world_size()

    affinity = set_affinity(args.gpu, mode=args.gpu_affinity)
    print(f"Training process {args.local_rank} affinity: {affinity}")

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            # Worker process should inherit its affinity from parent
            affinity = os.sched_getaffinity(0) 
            print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")

            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)

    else:

        def _worker_init_fn(id):
            # Worker process should inherit its affinity from parent
            affinity = os.sched_getaffinity(0)
            print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")

    if args.static_loss_scale != 1.0:
        if not args.amp:
            print("Warning: if --amp is not used, static_loss_scale will be ignored.")

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}".format(
                    args.optimizer_batch_size, tbs
                )
            )
        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)
            )
            start_epoch = checkpoint["epoch"]
            best_prec1 = checkpoint["best_prec1"]
            model_state = checkpoint["state_dict"]
            optimizer_state = checkpoint["optimizer"]
            if "state_dict_ema" in checkpoint:
                model_state_ema = checkpoint["state_dict_ema"]
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint["epoch"]
                )
            )
            if start_epoch >= args.epochs:
                print(
                    f"Launched training for {args.epochs}, checkpoint already run {start_epoch}"
                )
                exit(1)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            model_state_ema = None
            optimizer_state = None
    else:
        model_state = None
        model_state_ema = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    memory_format = (
        torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format
    )
    model = model_arch(
        **{
            k: v
            if k != "pretrained"
            else v and (not args.distributed or dist.get_rank() == 0)
            for k, v in model_args.__dict__.items()
        }
    )

    image_size = (
        args.image_size
        if args.image_size is not None
        else model.arch.default_image_size
    )

    scaler = torch.cuda.amp.GradScaler(
        init_scale=args.static_loss_scale,
        growth_factor=2,
        backoff_factor=0.5,
        growth_interval=100 if args.dynamic_loss_scale else 1000000000,
        enabled=args.amp,
    )

    executor = Executor(
        model,
        loss(),
        cuda=True,
        memory_format=memory_format,
        amp=args.amp,
        scaler=scaler,
        divide_loss=batch_size_multiplier,
        ts_script=args.jit == "script",
    )

    # Create data loaders and optimizers as needed
    if args.data_backend == "pytorch":
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == "dali-gpu":
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "dali-cpu":
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "syntetic":
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader
    else:
        print("Bad databackend picked")
        exit(1)

    train_loader, train_loader_len = get_train_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        args.mixup > 0.0,
        interpolation=args.interpolation,
        augmentation=args.augmentation,
        start_epoch=start_epoch,
        workers=args.workers,
        _worker_init_fn=_worker_init_fn,
        memory_format=memory_format,
        prefetch_factor=args.prefetch,
    )
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, train_loader)

    val_loader, val_loader_len = get_val_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        False,
        interpolation=args.interpolation,
        workers=args.workers,
        _worker_init_fn=_worker_init_fn,
        memory_format=memory_format,
        prefetch_factor=args.prefetch,
    )

    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger = log.Logger(
            args.print_freq,
            [
                dllogger.StdOutBackend(
                    dllogger.Verbosity.DEFAULT, step_format=log.format_step
                ),
                dllogger.JSONStreamBackend(
                    dllogger.Verbosity.VERBOSE,
                    os.path.join(args.workspace, args.raport_file),
                ),
            ],
            start_epoch=start_epoch - 1,
        )

    else:
        logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1)

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
    logger.log_parameter(
        {f"model.{k}": v for k, v in model_args.__dict__.items()},
        verbosity=dllogger.Verbosity.DEFAULT,
    )

    optimizer = get_optimizer(
        list(executor.model.named_parameters()),
        args.lr,
        args=args,
        state=optimizer_state,
    )

    if args.lr_schedule == "step":
        lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup)
    elif args.lr_schedule == "cosine":
        lr_policy = lr_cosine_policy(
            args.lr, args.warmup, args.epochs, end_lr=args.end_lr
        )
    elif args.lr_schedule == "linear":
        lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs)

    if args.distributed:
        executor.distributed(args.gpu)

    if model_state is not None:
        executor.model.load_state_dict(model_state)

    trainer = Trainer(
        executor,
        optimizer,
        grad_acc_steps=batch_size_multiplier,
        ema=args.use_ema,
    )

    if (args.use_ema is not None) and (model_state_ema is not None):
        trainer.ema_executor.model.load_state_dict(model_state_ema)

    return (
        trainer,
        lr_policy,
        train_loader,
        train_loader_len,
        val_loader,
        logger,
        start_epoch,
    )
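# Sketch of how a _worker_init_fn like the ones defined above is wired into a
# plain PyTorch DataLoader: each loader worker re-seeds numpy and random with
# its own offset so data augmentation differs between workers. base_seed is a
# placeholder for args.seed + args.local_rank.
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

base_seed = 42

def _worker_init_fn(worker_id):
    np.random.seed(seed=base_seed + worker_id)
    random.seed(base_seed + worker_id)

dataset = TensorDataset(torch.arange(100, dtype=torch.float32))
loader = DataLoader(dataset,
                    batch_size=10,
                    num_workers=2,
                    worker_init_fn=_worker_init_fn)
for (batch, ) in loader:
    pass  # one pass just to exercise the worker processes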
Example #25
def main(FLAGS):
    if FLAGS.hvd:
        hvd.init()
        if hvd.local_rank() == 0:
            tf.logging.set_verbosity(tf.logging.INFO)
            log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
            os.makedirs(FLAGS.results_dir, exist_ok=True)
            dllogger.init(backends=[
                dllogger.JSONStreamBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
            ])
        else:
            tf.logging.set_verbosity(tf.logging.ERROR)
            dllogger.init(backends=[])
        num_gpus = hvd.size()
    else:
        tf.logging.set_verbosity(tf.logging.INFO)
        log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.results_dir, exist_ok=True)
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
        num_gpus = 1

    dllogger.log(data=vars(FLAGS), step='PARAMETER')

    create_batches = FLAGS.batch_size // FLAGS.prebatch_size

    wide_columns, deep_columns = get_feature_columns(
        use_all_columns=FLAGS.use_all_columns)
    tf_transform_output = tft.TFTransformOutput(
        FLAGS.transformed_metadata_path)

    if not FLAGS.hvd or hvd.local_rank() == 0:
        tf.compat.v1.logging.warn('command line arguments: {}'.format(
            json.dumps(vars(FLAGS))))
        if not os.path.exists(FLAGS.results_dir):
            os.mkdir(FLAGS.results_dir)

        with open('{}/args.json'.format(FLAGS.results_dir), 'w') as f:
            json.dump(vars(FLAGS), f, indent=4)

    if FLAGS.gpu:
        session_config = tf.compat.v1.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
    else:
        session_config = tf.compat.v1.ConfigProto(
            device_count={'GPU': 0},
            log_device_placement=FLAGS.log_device_placement)

    if FLAGS.hvd:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    if FLAGS.xla:
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    if FLAGS.benchmark:
        model_dir = None
    else:
        model_dir = FLAGS.model_dir

    if FLAGS.save_checkpoints_steps != 0:
        run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(
            session_config=session_config,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            keep_checkpoint_max=1)
    else:
        run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(
            session_config=session_config,
            save_checkpoints_secs=FLAGS.save_checkpoints_secs,
            keep_checkpoint_max=1)

    wide_optimizer = tf.compat.v1.train.FtrlOptimizer(
        learning_rate=FLAGS.linear_learning_rate,
        l1_regularization_strength=FLAGS.linear_l1_regularization,
        l2_regularization_strength=FLAGS.linear_l2_regularization)

    deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer(
        learning_rate=FLAGS.deep_learning_rate,
        initial_accumulator_value=0.1,
        l1_regularization_strength=FLAGS.deep_l1_regularization,
        l2_regularization_strength=FLAGS.deep_l2_regularization,
        use_locking=False)

    if FLAGS.hvd:
        wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
        deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)

    stats_filename = os.path.join(FLAGS.transformed_metadata_path,
                                  'stats.json')
    embed_columns = None

    # input functions to read data from disk
    train_input_fn = lambda: separate_input_fn(
        tf_transform_output,
        FLAGS.train_data_pattern,
        create_batches,
        tf.estimator.ModeKeys.TRAIN,
        reader_num_threads=FLAGS.reader_num_threads,
        parser_num_threads=FLAGS.parser_num_threads,
        shuffle_buffer_size=int(FLAGS.shuffle_percentage * create_batches),
        prefetch_buffer_size=FLAGS.prefetch_buffer_size,
        print_display_ids=FLAGS.print_display_ids)
    eval_input_fn = lambda: separate_input_fn(
        tf_transform_output,
        FLAGS.eval_data_pattern,
        (FLAGS.eval_batch_size // FLAGS.prebatch_size),
        tf.estimator.ModeKeys.EVAL,
        reader_num_threads=1,
        parser_num_threads=1,
        shuffle_buffer_size=int(FLAGS.shuffle_percentage * create_batches),
        prefetch_buffer_size=FLAGS.prefetch_buffer_size,
        print_display_ids=FLAGS.print_display_ids)

    estimator = construct_estimator(FLAGS.model_type,
                                    not FLAGS.canned_estimator,
                                    run_config,
                                    wide_columns,
                                    wide_optimizer,
                                    deep_columns,
                                    FLAGS.deep_hidden_units,
                                    FLAGS.deep_dropout,
                                    deep_optimizer,
                                    amp=FLAGS.amp)

    estimator = tf.estimator.add_metrics(estimator, map_custom_metric)
    estimator = tf.estimator.add_metrics(estimator,
                                         map_custom_metric_with_leak)

    steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size

    print('Steps per epoch: {}'.format(steps_per_epoch))
    max_steps = int(FLAGS.num_epochs * steps_per_epoch)

    hooks = []
    if FLAGS.hvd:
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.predict or FLAGS.evaluate:  # inference
        if FLAGS.benchmark:
            benchmark_hook = BenchmarkLoggingHook(
                global_batch_size=num_gpus * FLAGS.eval_batch_size,
                warmup_steps=FLAGS.benchmark_warmup_steps)
            hooks.append(benchmark_hook)
            eval_steps = FLAGS.benchmark_steps
        else:
            eval_steps = FLAGS.eval_steps

        predict_result_iter = estimator.predict(input_fn=eval_input_fn,
                                                hooks=hooks,
                                                yield_single_examples=False)

        results = []
        for i, r in enumerate(predict_result_iter):
            print('predicting batch: ', i)
            results.append(r)
            # TODO: use eval_steps
            if i >= eval_steps - 1:
                break

        if FLAGS.benchmark:
            infer_throughput = benchmark_hook.mean_throughput.value()

        if FLAGS.benchmark:
            dllogger.log(data={'infer_throughput': infer_throughput},
                         step=tuple())
        elif FLAGS.evaluate:
            print(
                'evaluating using estimator.evaluate with eval_batch_size = ',
                FLAGS.eval_batch_size, ' and eval_steps = ', FLAGS.eval_steps)

            result = estimator.evaluate(eval_input_fn,
                                        hooks=hooks,
                                        steps=FLAGS.eval_steps)
            dllogger.log(step=(),
                         data={
                             'map_infer': float(result['map']),
                             'map_with_leak_infer':
                             float(result['map_with_leak'])
                         })
        elif FLAGS.predict:
            scores = [r['probabilities'][:, 1] for r in results]
            scores = np.hstack(scores)
            scores_path = os.path.join(FLAGS.model_dir, 'scores.txt')
            print('saving the numpy scores array to: ', scores_path)
            np.savetxt(scores_path, scores, fmt="%f", delimiter='\n')

    else:  # training

        if FLAGS.benchmark:
            benchmark_hook = BenchmarkLoggingHook(
                global_batch_size=num_gpus * FLAGS.batch_size,
                warmup_steps=FLAGS.benchmark_warmup_steps)
            hooks.append(benchmark_hook)
            estimator.train(train_input_fn,
                            hooks=hooks,
                            steps=FLAGS.benchmark_steps)
            train_throughput = benchmark_hook.mean_throughput.value()
            dllogger.log(data={'train_throughput': train_throughput},
                         step=tuple())
        else:
            train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                                max_steps=max_steps,
                                                hooks=hooks)
            eval_spec = tf.estimator.EvalSpec(
                input_fn=eval_input_fn,
                throttle_secs=FLAGS.eval_throttle_secs,
                steps=FLAGS.eval_steps)
            result = tf.estimator.train_and_evaluate(estimator, train_spec,
                                                     eval_spec)

            if result:
                dllogger.log(step=(),
                             data={
                                 'map': float(result[0]['map']),
                                 'map_with_leak': float(result[0]['map_with_leak'])
                             })
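
# A minimal sketch of the metric_fn shape expected by tf.estimator.add_metrics, as used
# above with map_custom_metric. The AUC metric here is an illustrative assumption; the
# original map_custom_metric computes mean average precision. The 'probabilities' key
# matches the predictions dict this script already relies on.
import tensorflow as tf

def example_metric_fn(labels, predictions):
    # `predictions` is the dict emitted by the estimator's model_fn; a metric_fn must
    # return a dict mapping metric names to (value_tensor, update_op) pairs.
    probabilities = predictions['probabilities'][:, 1]
    return {'auc_sketch': tf.compat.v1.metrics.auc(labels=labels, predictions=probabilities)}

# Usage sketch: estimator = tf.estimator.add_metrics(estimator, example_metric_fn)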
Beispiel #26
0
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.metadata('train_throughput', {'name': 'train_throughput', 'format': ':.3e'})
    dllogger.metadata('hr@10', {'name': 'hr@10', 'format': ':.5f'})
    dllogger.metadata('train_epoch_time', {'name': 'train_epoch_time', 'format': ':.3f'})
    dllogger.metadata('validation_epoch_time', {'name': 'validation_epoch_time', 'format': ':.3f'})
    dllogger.metadata('eval_throughput', {'name': 'eval_throughput', 'format': ':.3e'})

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if args.checkpoint_dir and not os.path.exists(args.checkpoint_dir):
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
    trainset = dataloading.TorchTensorDataset(feature_spec,
                                              mapping_name='train',
                                              args=args)
    testset = dataloading.TorchTensorDataset(feature_spec,
                                             mapping_name='test',
                                             args=args)
    train_loader = dataloading.TrainDataloader(trainset, args)
    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
    model = NeuMF(
        nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
        nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
        mf_dim=args.factors,
        mlp_layer_sizes=args.layers,
        dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
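    # torch.jit.trace records the loss forward pass into a TorchScript graph; the random
    # tensors below are only example inputs for tracing and are not used for training.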
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_loader,
                             args.topk,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    # max_hr and best_epoch are overwritten as soon as any epoch achieves hr > 0; initializing
    # them here avoids referencing uninitialized variables in the unlikely case that the hit
    # rate stays at zero for every epoch.
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()
        batch_dict_list = train_loader.get_epoch_data()
        num_batches = len(batch_dict_list)
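        # Gradient accumulation: run args.grads_accumulated forward/backward passes,
        # letting gradients sum into .grad, then apply a single optimizer step.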
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                batch_dict = batch_dict_list[batch_idx]

                user_features = batch_dict[USER_CHANNEL_NAME]
                item_features = batch_dict[ITEM_CHANNEL_NAME]

                user_batch = user_features[user_feature_name]
                item_batch = item_features[item_feature_name]

                label_features = batch_dict[LABEL_CHANNEL_NAME]
                label_batch = label_features[label_feature_name]

                outputs = model(user_batch, item_batch)
                loss = traced_criterion(outputs, label_batch.view(-1,
                                                                  1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_loader,
                             args.topk,
                             distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(step=tuple(),
                     data={
                         'best_train_throughput': max(train_throughputs),
                         'best_eval_throughput': max(eval_throughputs),
                         'mean_train_throughput': np.mean(train_throughputs),
                         'mean_eval_throughput': np.mean(eval_throughputs),
                         'best_accuracy': max_hr,
                         'best_epoch': best_epoch,
                         'time_to_target': time.time() - main_start_time,
                         'time_to_best_model': best_model_timestamp - main_start_time
                     })
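
# A minimal, self-contained sketch of the gradient-accumulation pattern used in the
# training loop above. The model, optimizer, and micro_batches arguments are illustrative
# placeholders; only the accumulate-then-step structure mirrors the original script.
import torch
from torch import nn

def accumulation_step_sketch(model: nn.Module,
                             optimizer: torch.optim.Optimizer,
                             criterion: nn.Module,
                             micro_batches,
                             grads_accumulated: int = 4):
    """Run `grads_accumulated` forward/backward passes, then a single optimizer step."""
    for inputs, labels in micro_batches[:grads_accumulated]:
        loss = criterion(model(inputs), labels)
        loss.backward()  # gradients accumulate into .grad across the inner loop
    optimizer.step()
    # Dropping gradients (p.grad = None) is cheaper than zeroing them in place.
    for p in model.parameters():
        p.grad = None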
def main():
    args = parse_args()

    hvd.init()
    set_affinity(hvd.local_rank())

    if is_main_process():
        log("Running total processes: {}".format(get_world_size()))
    log("Starting process: {}".format(get_rank()))

    if is_main_process():
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.json_summary),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
    else:
        dllogger.init(backends=[])

    tf.random.set_seed(args.seed)
    dllogger.log(step="PARAMETER", data={"SEED": args.seed})
    # script parameters
    BATCH_SIZE = args.train_batch_size
    EVAL_BATCH_SIZE = args.predict_batch_size
    USE_XLA = args.xla
    USE_AMP = args.amp
    EPOCHS = args.num_train_epochs

    if not args.do_train:
        EPOCHS = args.num_train_epochs = 1
        log("Since running inference only, setting args.num_train_epochs to 1")

    if not os.path.exists(args.output_dir) and is_main_process():
        os.makedirs(args.output_dir)

    # TensorFlow configuration
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    tf.config.optimizer.set_jit(USE_XLA)
    #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
    
    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' % policy.compute_dtype)  # Compute dtype: float16
        print('Variable dtype: %s' % policy.variable_dtype)  # Variable dtype: float32

    if is_main_process():
        log("***** Loading tokenizer and model *****")
    # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
    electra_model = args.electra_model
    config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir)
    config.update({"amp": args.amp})
    if args.vocab_file is None:
        tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir)
    else:
        tokenizer = ElectraTokenizer(
            vocab_file=args.vocab_file,
            do_lower_case=args.do_lower_case)

    model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args)

    if is_main_process():
        log("***** Loading dataset *****")
    # Load data
    processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
    train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None
    dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None

    if is_main_process():
        log("***** Loading features *****")
    # Load cached features
    squad_version = '2.0' if args.version_2_with_negative else '1.1'
    if args.cache_dir is None:
        args.cache_dir = args.data_dir
    cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format(
        electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length), squad_version)
    cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format(
        electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length), squad_version)

    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader) if args.do_train else []
        with open(cached_dev_features_file, "rb") as reader:
            dev_features = pickle.load(reader) if args.do_predict else []
    except Exception:  # cached features missing or unreadable; regenerate them below
        train_features = (  # TODO: (yy) do on rank 0?
            squad_convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True,
                return_dataset="",
            )
            if args.do_train
            else []
        )
        dev_features = (
            squad_convert_examples_to_features(
                examples=dev_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=False,
                return_dataset="",
            )
            if args.do_predict
            else []
        )
        # Dump Cached features
        if not args.skip_cache and is_main_process():
            if args.do_train:
                log("***** Building Cache Files: {} *****".format(cached_train_features_file))
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
            if args.do_predict:
                log("***** Building Cache Files: {} *****".format(cached_dev_features_file))
                with open(cached_dev_features_file, "wb") as writer:
                    pickle.dump(dev_features, writer)

    len_train_features = len(train_features)
    total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1
    train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1
    len_dev_features = len(dev_features)
    total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1

    train_dataset = get_dataset_from_features(train_features, BATCH_SIZE,
                                              v2=args.version_2_with_negative) if args.do_train else []
    dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev",
                                            v2=args.version_2_with_negative) if args.do_predict else []

    opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps,
                           num_warmup_steps=int(args.warmup_proportion * total_train_steps),
                           weight_decay_rate=args.weight_decay_rate,
                           layerwise_lr_decay=args.layerwise_lr_decay,
                           n_transformer_layers=model.num_hidden_layers)
    if USE_AMP:
        # loss scaling is currently required when using mixed precision
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")

    # Define loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    loss_class = tf.keras.losses.BinaryCrossentropy(
        from_logits=True,
        name='binary_crossentropy'
    )
    metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    model.compile(optimizer=opt, loss=loss, metrics=[metric])
    train_loss_results = []

    if args.do_train and is_main_process():
        log("***** Running training *****")
        log("  Num examples = ", len_train_features)
        log("  Num Epochs = ", args.num_train_epochs)
        log("  Instantaneous batch size per GPU = ", args.train_batch_size)
        log("  Total train batch size (w. parallel, distributed & accumulation) = ",
            args.train_batch_size * get_world_size())
        log("  Total optimization steps =", total_train_steps)

    total_train_time = 0
    latency = []
    for epoch in range(EPOCHS):
        if args.do_train:
            epoch_loss_avg = tf.keras.metrics.Mean()
            epoch_perf_avg = tf.keras.metrics.Mean()
            epoch_start = time.time()

            epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5,
                                  disable=not is_main_process())
            for iter, inputs in enumerate(epoch_iterator):
                # stop early once max_steps is reached (only when max_steps > 0)
                if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps:
                    break
                iter_start = time.time()
                # Optimize the model
                loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0),
                                        v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP)
                epoch_perf_avg.update_state(1. * BATCH_SIZE / (time.time() - iter_start))
                if iter % args.log_freq == 0:
                    if is_main_process():
                        log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}".format(epoch, iter, loss_value,
                                                                                              epoch_perf_avg.result() * get_world_size(), opt.loss_scale if config.amp else 1,
                                                                                              int(opt.iterations)))
                    dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()),
                                                            "train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())})

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss

            # End epoch
            train_loss_results.append(epoch_loss_avg.result())
            total_train_time += float(time.time() - epoch_start)
            # Summarize and save checkpoint at the end of each epoch
            if is_main_process():

                dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time,
                                                 "training_sequences_per_second": float(
                                                     epoch_perf_avg.result().numpy() * get_world_size()),
                                                 "final_loss": float(epoch_loss_avg.result().numpy())})

            if not args.skip_checkpoint:
                if args.ci:
                    checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.output_dir, args.version_2_with_negative, epoch + 1)
                else:
                    checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1)
                if is_main_process():
                    model.save_weights(checkpoint_name)


        if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1):
            if not args.do_train:
                log("***** Loading checkpoint: {} *****".format(args.init_checkpoint))
                model.load_weights(args.init_checkpoint).expect_partial()

            current_feature_id = 0
            all_results = []
            if is_main_process():
                log("***** Running evaluation *****")
                log("  Num Batches = ", total_dev_steps)
                log("  Batch size = ", args.predict_batch_size)

            raw_infer_start = time.time()
            if is_main_process():
                infer_perf_avg = tf.keras.metrics.Mean()
                dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5,
                                    disable=not is_main_process())
                for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator:
                    # training=False is needed only if there are layers with different
                    # behavior during training versus inference (e.g. Dropout).

                    iter_start = time.time()

                    if not args.joint_head:
                        batch_start_logits, batch_end_logits = infer_step(model, input_ids,
                                                                          attention_mask=input_mask,
                                                                          token_type_ids=segment_ids,
                                                                          )[:2]
                        # Synchronize with the GPU so the measured time includes the forward pass
                        _ = batch_start_logits.numpy()

                    else:
                        
                        outputs = infer_step(model, input_ids,
                                             attention_mask=input_mask,
                                             token_type_ids=segment_ids,
                                             cls_index=cls_index,
                                             p_mask=p_mask,
                                             )
                        # Synchronize with the GPU so the measured time includes the forward pass
                        _ = outputs[0].numpy()

                    infer_time = (time.time() - iter_start)
                    infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE / infer_time)
                    latency.append(infer_time)

                    for iter_ in range(input_ids.shape[0]):

                        if not args.joint_head:
                            start_logits = batch_start_logits[iter_].numpy().tolist()
                            end_logits = batch_end_logits[iter_].numpy().tolist()
                            dev_feature = dev_features[current_feature_id]
                            current_feature_id += 1
                            unique_id = int(dev_feature.unique_id)
                            all_results.append(RawResult(unique_id=unique_id,
                                                         start_logits=start_logits,
                                                         end_logits=end_logits))
                        else:
                            dev_feature = dev_features[current_feature_id]
                            current_feature_id += 1
                            unique_id = int(dev_feature.unique_id)
                            output = [output[iter_].numpy().tolist() for output in outputs]

                            start_logits = output[0]
                            start_top_index = output[1]
                            end_logits = output[2]
                            end_top_index = output[3]
                            cls_logits = output[4]
                            result = SquadResult(
                                unique_id,
                                start_logits,
                                end_logits,
                                start_top_index=start_top_index,
                                end_top_index=end_top_index,
                                cls_logits=cls_logits,
                            )

                            all_results.append(result)

                # Compute and save predictions
                answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args)

                output_prediction_file = os.path.join(args.output_dir, "predictions.json")
                output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
                e2e_infer_time = time.time() - raw_infer_start
                # if args.version_2_with_negative:
                #     output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
                # else:
                #     output_null_log_odds_file = None
                with open(output_prediction_file, "w") as f:
                    f.write(json.dumps(answers, indent=4) + "\n")
                with open(output_nbest_file, "w") as f:
                    f.write(json.dumps(nbest_answers, indent=4) + "\n")

                if args.do_eval:
                    if args.version_2_with_negative:
                        dev_file = "dev-v2.0.json"
                    else:
                        dev_file = "dev-v1.1.json"

                    eval_out = subprocess.check_output([sys.executable, args.eval_script,
                                                        args.data_dir + "/" + dev_file, output_prediction_file])
                    log(eval_out.decode('UTF-8'))
                    scores = str(eval_out).strip()
                    exact_match = float(scores.split(":")[1].split(",")[0])
                    if args.version_2_with_negative:
                        f1 = float(scores.split(":")[2].split(",")[0])
                    else:
                        f1 = float(scores.split(":")[2].split("}")[0])

                    log("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8')))
                    log("**EVAL SUMMARY** - Epoch: {:03d},  EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s"
                          .format(epoch, exact_match, f1, infer_perf_avg.result()))

                latency_all = sorted(latency)[:-2]
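                # Note: the two slowest batches are dropped, and the 90/95/99% figures below are
                # mean latencies over the fastest 90/95/99% of batches rather than strict percentiles.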
                log(
                    "**LATENCY SUMMARY** - Epoch: {:03d},  Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms"
                    .format(epoch, sum(latency_all) / len(latency_all) * 1000,
                            sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000,
                            sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000,
                            sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000,
                            ))
                dllogger.log(step=tuple(),
                             data={"inference_sequences_per_second": float(infer_perf_avg.result().numpy()), 
                                   "e2e_inference_time": e2e_infer_time})

    if is_main_process() and args.do_train and args.do_eval:
        log(
            "**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s"
            .format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(),
                    infer_perf_avg.result()))
        dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
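
# A minimal sketch of a loss-scaled train step with the experimental tf.keras
# LossScaleOptimizer used above (TF 2.x). The model, loss_fn, and inputs are placeholders,
# and the original train_step may differ in details (e.g. the joint head and extra losses).
import tensorflow as tf

@tf.function
def scaled_train_step_sketch(model, optimizer, loss_fn, features, labels):
    with tf.GradientTape() as tape:
        logits = model(features, training=True)
        loss = loss_fn(labels, logits)
        # Scale the loss so small fp16 gradients do not underflow.
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
    # Undo the scaling before applying the update.
    grads = optimizer.get_unscaled_gradients(scaled_grads)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss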
def main():

    run = Run.get_context()
    workspace = run.experiment.workspace

    # First thing to do is try to set up from environment
    configure_nccl_settings_from_env()

    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=os.getenv("LOCAL_RANK", 0))
    parser.add_argument(
        "--max_steps",
        type=int,
        default=0,
        help="Override number of training steps in the config",
    )
    parser.add_argument("--dataset", type=str, required=True)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument("--fp16", help="Mixed precision training", action="store_true")
    parser.add_argument("--amp", help="Mixed precision training", action="store_true")
    parser.add_argument(
        "--skip_checkpoint",
        default=False,
        action="store_true",
        help="Whether to save checkpoints",
    )
    parser.add_argument(
        "--json-summary",
        help="Out file for DLLogger",
        default="dllogger.out",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
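    # --fp16 and --amp are treated as synonyms here: either flag enables mixed-precision training.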
    args.fp16 = args.fp16 or args.amp

    num_gpus = get_global_size()
    args.distributed = num_gpus > 1
    args.local_rank = get_local_rank()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # Redundant option - Override config parameter with command line input
    if args.max_steps > 0:
        cfg.SOLVER.MAX_ITER = args.max_steps

    if args.skip_checkpoint:
        cfg.SAVE_CHECKPOINT = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    if is_main_process():
        dllogger.init(
            backends=[
                dllogger.JSONStreamBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary
                ),
                dllogger.StdOutBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step
                ),
            ]
        )
    else:
        dllogger.init(backends=[])

    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()})
    dllogger.log(step="PARAMETER", data={"config_file": args.config_file})

    dllogger.log(step="PARAMETER", data={"config": cfg})

    if args.fp16:
        fp16 = True
    else:
        fp16 = False

    if args.local_rank == 0:
        dllogger.log(step="WEIGHT DOWNLOAD", data={"complete": False})
        download_weights(cfg.MODEL.WEIGHT, cfg.PATHS_CATALOG)
        dllogger.log(step="WEIGHT DOWNLOAD", data={"complete": True})

        dllogger.log(
            step="DATASET MOUNT", data={"complete": False, "dataset": args.dataset}
        )
        coco2017 = Dataset.get_by_name(workspace, args.dataset)
        cc2017mount = coco2017.mount("/data")
        cc2017mount.start()
        dllogger.log(
            step="DATASET MOUNT", data={"complete": True, "dataset": args.dataset}
        )

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    model, iters_per_epoch = train(
        cfg, args.local_rank, args.distributed, fp16, dllogger
    )
Beispiel #29
0
def main(args):
    exp_start_time = time.time()
    global best_prec1
    best_prec1 = 0

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        args.local_rank = int(os.environ['LOCAL_RANK'])

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.amp and args.fp16:
        print("Please use only one of the --fp16/--amp flags")
        exit(1)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)
    else:

        def _worker_init_fn(id):
            pass

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print(
                "Warning:  if --fp16 is not used, static_loss_scale will be ignored."
            )

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
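        # optimizer_batch_size simulates a larger effective batch: batch_size_multiplier
        # local batches are accumulated before each optimizer step (see train_loop below).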
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}"
                .format(args.optimizer_batch_size, tbs))
        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    pretrained_weights = None
    if args.pretrained_weights:
        if os.path.isfile(args.pretrained_weights):
            print("=> loading pretrained weights from '{}'".format(
                args.pretrained_weights))
            pretrained_weights = torch.load(args.pretrained_weights)
        else:
            print("=> no pretrained weights found at '{}'".format(args.resume))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model_state = checkpoint['state_dict']
            optimizer_state = checkpoint['optimizer']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            optimizer_state = None
    else:
        model_state = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    model_and_loss = ModelAndLoss((args.arch, args.model_config),
                                  loss,
                                  pretrained_weights=pretrained_weights,
                                  cuda=True,
                                  fp16=args.fp16)

    # Create data loaders and optimizers as needed
    if args.data_backend == 'pytorch':
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == 'dali-gpu':
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == 'dali-cpu':
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == 'syntetic':
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader

    train_loader, train_loader_len = get_train_loader(args.data,
                                                      args.batch_size,
                                                      1000,
                                                      args.mixup > 0.0,
                                                      workers=args.workers,
                                                      fp16=args.fp16)
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, 1000, train_loader)

    val_loader, val_loader_len = get_val_loader(args.data,
                                                args.batch_size,
                                                1000,
                                                False,
                                                workers=args.workers,
                                                fp16=args.fp16)

    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger = log.Logger(args.print_freq, [
            dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                   step_format=log.format_step),
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                os.path.join(args.workspace, args.raport_file))
        ])

    else:
        logger = log.Logger(args.print_freq, [])

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)

    optimizer = get_optimizer(list(model_and_loss.model.named_parameters()),
                              args.fp16,
                              args.lr,
                              args.momentum,
                              args.weight_decay,
                              nesterov=args.nesterov,
                              bn_weight_decay=args.bn_weight_decay,
                              state=optimizer_state,
                              static_loss_scale=args.static_loss_scale,
                              dynamic_loss_scale=args.dynamic_loss_scale)

    if args.lr_schedule == 'step':
        lr_policy = lr_step_policy(args.lr, [30, 60, 80],
                                   0.1,
                                   args.warmup,
                                   logger=logger)
    elif args.lr_schedule == 'cosine':
        lr_policy = lr_cosine_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)
    elif args.lr_schedule == 'linear':
        lr_policy = lr_linear_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)

    if args.amp:
        model_and_loss, optimizer = amp.initialize(
            model_and_loss,
            optimizer,
            opt_level="O2",
            loss_scale="dynamic"
            if args.dynamic_loss_scale else args.static_loss_scale)

    if args.distributed:
        model_and_loss.distributed()

    model_and_loss.load_model_state(model_state)

    train_loop(model_and_loss,
               optimizer,
               lr_policy,
               train_loader,
               val_loader,
               args.epochs,
               args.fp16,
               logger,
               should_backup_checkpoint(args),
               use_amp=args.amp,
               batch_size_multiplier=batch_size_multiplier,
               start_epoch=start_epoch,
               best_prec1=best_prec1,
               prof=args.prof,
               skip_training=args.evaluate,
               skip_validation=args.training_only,
               save_checkpoints=args.save_checkpoints and not args.evaluate,
               checkpoint_dir=args.workspace)
    exp_duration = time.time() - exp_start_time
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger.end()
    print("Experiment ended")
Beispiel #30
0
def main():
    hvd.init()

    parser = ArgumentParser(description="Train a Variational Autoencoder for Collaborative Filtering in TensorFlow")
    parser.add_argument('--train', action='store_true',
                        help='Run training of VAE')
    parser.add_argument('--test', action='store_true',
                        help='Run validation of VAE')
    parser.add_argument('--inference', action='store_true',
                        help='Run inference on a single random example.'
                        ' This can also be used to measure the latency for a batch size of 1')
    parser.add_argument('--inference_benchmark', action='store_true',
                        help='Benchmark the inference throughput on a very large batch size')
    parser.add_argument('--use_tf_amp', action='store_true',
                        help='Enable Automatic Mixed Precision')
    parser.add_argument('--epochs', type=int, default=400,
                        help='Number of epochs to train')
    parser.add_argument('--batch_size_train', type=int, default=24576,
                        help='Global batch size for training')
    parser.add_argument('--batch_size_validation', type=int, default=10000,
                        help='Used both for validation and testing')
    parser.add_argument('--validation_step', type=int, default=50,
                        help='Number of training epochs between validation runs')
    parser.add_argument('--warm_up_epochs', type=int, default=5,
                        help='Number of epochs to omit during benchmark')
    parser.add_argument('--total_anneal_steps', type=int, default=15000,
                        help='Number of annealing steps')
    parser.add_argument('--anneal_cap', type=float, default=0.1,
                        help='Annealing cap')
    parser.add_argument('--lam', type=float, default=1.00,
                        help='Regularization parameter')
    parser.add_argument('--lr', type=float, default=0.004,
                        help='Learning rate')
    parser.add_argument('--beta1', type=float, default=0.90,
                        help='Adam beta1')
    parser.add_argument('--beta2', type=float, default=0.90,
                        help='Adam beta2')
    parser.add_argument('--top_results', type=int, default=100,
                        help='Number of results to be recommended')
    parser.add_argument('--xla', action='store_true', default=False,
                        help='Enable XLA')
    parser.add_argument('--trace', action='store_true', default=False,
                        help='Save profiling traces')
    parser.add_argument('--activation', type=str, default='tanh',
                        help='Activation function')
    parser.add_argument('--log_path', type=str, default='./vae_cf.log',
                        help='Path to the detailed training log to be created')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed for TensorFlow and numpy')
    parser.add_argument('--data_dir', default='/data', type=str,
                        help='Directory for storing the training data')
    parser.add_argument('--checkpoint_dir', type=str,
                        default=None,
                        help='Path for saving a checkpoint after the training')
    args = parser.parse_args()

    if args.batch_size_train % hvd.size() != 0:
        raise ValueError('Global batch size should be a multiple of the number of workers')

    args.local_batch_size = args.batch_size_train // hvd.size()

    logger = logging.getLogger("VAE")
    if hvd.rank() == 0:
        logger.setLevel(logging.INFO)
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.log_path),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])
        logger.setLevel(logging.ERROR)

    dllogger.log(data=vars(args), step='PARAMETER')

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Suppress TF warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # set AMP
    os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0'

    # load dataset
    (train_data,
     validation_data_input,
     validation_data_true,
     test_data_input,
     test_data_true) = load_and_parse_ML_20M(args.data_dir)

    # make sure all dims and sizes are divisible by 8
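    # (padding to a multiple of 8 keeps the GEMM dimensions Tensor Core friendly under mixed precision)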
    number_of_train_users, number_of_items = train_data.shape
    number_of_items = round_8(number_of_items)

    for data in [train_data,
                 validation_data_input,
                 validation_data_true,
                 test_data_input,
                 test_data_true]:
        number_of_users, _ = data.shape
        data.resize(number_of_users, number_of_items)

    number_of_users, number_of_items = train_data.shape
    encoder_dims = [number_of_items, 600, 200]

    vae = VAE(train_data, encoder_dims, total_anneal_steps=args.total_anneal_steps,
              anneal_cap=args.anneal_cap, batch_size_train=args.local_batch_size,
              batch_size_validation=args.batch_size_validation, lam=args.lam,
              lr=args.lr, beta1=args.beta1, beta2=args.beta2, activation=args.activation,
              xla=args.xla, checkpoint_dir=args.checkpoint_dir, trace=args.trace,
              top_results=args.top_results)

    metrics = {'ndcg@100': partial(ndcg, R=100),
               'recall@20': partial(recall, R=20),
               'recall@50': partial(recall, R=50)}

    if args.train:
        vae.train(n_epochs=args.epochs, validation_data_input=validation_data_input,
                  validation_data_true=validation_data_true,  metrics=metrics,
                  validation_step=args.validation_step)

    if args.test and hvd.size() <= 1:
        test_results = vae.test(test_data_input=test_data_input,
                                test_data_true=test_data_true, metrics=metrics)

        for k, v in test_results.items():
            print("{}:\t{}".format(k, v))
    elif args.test and hvd.size() > 1:
        print("Testing is not supported with horovod multigpu yet")

    if args.inference_benchmark and hvd.size() <= 1:
        # use the train data to get accurate throughput numbers for inference
        # the test and validation sets are too small to measure this accurately
        # vae.inference_benchmark()
        _ = vae.test(test_data_input=train_data,
                     test_data_true=train_data, metrics={})

    if args.inference:
        input_data = np.random.randint(low=0, high=10000, size=10)
        recommendations = vae.query(input_data=input_data)
        print('Recommended item indices: ', recommendations)

    vae.close_session()
    dllogger.flush()
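
# A minimal sketch of what the round_8 helper used above presumably does: round a dimension
# up to the next multiple of 8 so the padded matrices stay Tensor Core friendly. This
# stand-in is illustrative, not the original implementation.
def round_8_sketch(n: int) -> int:
    """Round n up to the nearest multiple of 8, e.g. 123 -> 128."""
    return ((n + 7) // 8) * 8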