    def test_quantization_saved(self):
        from lpot.utils.pytorch import load

        model = copy.deepcopy(self.model)

        for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']:
            if fake_yaml == 'ptq_yaml.yaml':
                model.eval().fuse_model()
            quantizer = Quantization(fake_yaml)
            dataset = quantizer.dataset('dummy', (100, 3, 256, 256),
                                        label=True)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = common.DataLoader(dataset)
            quantizer.eval_dataloader = common.DataLoader(dataset)
            if fake_yaml == 'qat_yaml.yaml':
                quantizer.q_func = q_func
            q_model = quantizer()
            q_model.save('./saved')
            # Load the configuration and weights via lpot.utils
            saved_model = load("./saved", model)
            eval_func(saved_model)
        from lpot import Benchmark
        evaluator = Benchmark('ptq_yaml.yaml')
        # Load the configuration and weights via lpot.model
        evaluator.model = common.Model(model)
        evaluator.b_dataloader = common.DataLoader(dataset)
        results = evaluator()
        evaluator.model = common.Model(model)
        fp32_results = evaluator()
        self.assertTrue(
            (fp32_results['accuracy'][0] - results['accuracy'][0]) < 0.01)
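
# Note: eval_func and q_func referenced throughout these examples are helpers defined
# elsewhere in the original test modules / scripts and are not shown in the snippets.
# A minimal, hypothetical sketch of what such helpers could look like, assuming a
# 10-class classifier with 3x256x256 inputs and random data (illustrative only, not
# the original implementations):
import torch


def eval_func(model):
    """Return a toy accuracy computed on a couple of random batches."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for _ in range(2):
            images = torch.randn(4, 3, 256, 256)
            labels = torch.randint(0, 10, (4,))
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


def q_func(model):
    """Run one short training pass, as quantization-aware training expects."""
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()
    for _ in range(2):
        images = torch.randn(4, 3, 256, 256)
        labels = torch.randint(0, 10, (4,))
        optimizer.zero_grad()
        criterion(model(images), labels).backward()
        optimizer.step()
    return model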
Example #2
    def test_quantization_saved(self):
        for fake_yaml in [
                'dynamic_yaml.yaml', 'qat_yaml.yaml', 'ptq_yaml.yaml'
        ]:
            if fake_yaml == 'dynamic_yaml.yaml':
                model = torchvision.models.resnet18()
            else:
                model = copy.deepcopy(self.model)
            if fake_yaml == 'ptq_yaml.yaml':
                model.eval().fuse_model()
            quantizer = Quantization(fake_yaml)
            dataset = quantizer.dataset('dummy', (100, 3, 256, 256),
                                        label=True)
            quantizer.model = common.Model(model)
            if fake_yaml == 'qat_yaml.yaml':
                quantizer.q_func = q_func
            else:
                quantizer.calib_dataloader = common.DataLoader(dataset)
            quantizer.eval_dataloader = common.DataLoader(dataset)
            q_model = quantizer()
            q_model.save('./saved')
            # Load the configuration and weights via lpot.utils
            saved_model = load("./saved", model)
            eval_func(saved_model)
            shutil.rmtree('./saved', ignore_errors=True)
        from lpot.experimental import Benchmark
        evaluator = Benchmark('ptq_yaml.yaml')
        # Load the configuration and weights via lpot.model
        evaluator.model = common.Model(model)
        evaluator.b_dataloader = common.DataLoader(dataset)
        evaluator()
        evaluator.model = common.Model(model)
        evaluator()
Example #3
def main():
    class CalibrationDL():
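        """Calibration data loader: yields the preprocessed samples listed in brats_cal_images_list.txt one at a time."""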
        def __init__(self):
            path = os.path.abspath(
                os.path.expanduser('./brats_cal_images_list.txt'))
            with open(path, 'r') as f:
                self.preprocess_files = [line.rstrip() for line in f]

            self.loaded_files = {}
            self.batch_size = 1

        def __getitem__(self, sample_id):
            file_name = self.preprocess_files[sample_id]
            print("Loading file {:}".format(file_name))
            with open(
                    os.path.join('build/calib_preprocess/',
                                 "{:}.pkl".format(file_name)), "rb") as f:
                self.loaded_files[sample_id] = pickle.load(f)[0]
            return torch.from_numpy(
                self.loaded_files[sample_id][np.newaxis, ...]).float(), None

        def __len__(self):
            self.count = len(self.preprocess_files)
            return self.count

    args = get_args()
    assert args.backend == "pytorch"
    model_path = os.path.join(args.model_dir, "plans.pkl")
    assert os.path.isfile(
        model_path), "Cannot find the model file {:}!".format(model_path)
    trainer, params = load_model_and_checkpoint_files(
        args.model_dir,
        folds=1,
        fp16=False,
        checkpoint_name='model_final_checkpoint')
    trainer.load_checkpoint_ram(params[0], False)
    model = trainer.network

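    # --tune: run LPOT post-training quantization driven by conf.yaml and save the result to ./lpot_workspace.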
    if args.tune:
        quantizer = Quantization('conf.yaml')
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        calib_dl = CalibrationDL()
        quantizer.calib_dataloader = calib_dl
        q_model = quantizer()
        q_model.save('./lpot_workspace')
        exit(0)

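    # --benchmark: evaluate either the FP32 model or the INT8 model restored from ./lpot_workspace.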
    if args.benchmark:
        model.eval()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser('./lpot_workspace')), model)
        else:
            new_model = model
        eval_func(new_model)
Example #4
    def test_fx_dynamic_quant(self):
        # Model Definition
        class LSTMModel(nn.Module):
            """Container module with an encoder, a recurrent module, and a decoder."""
            def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
                super(LSTMModel, self).__init__()
                self.drop = nn.Dropout(dropout)
                self.encoder = nn.Embedding(ntoken, ninp)
                self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
                self.decoder = nn.Linear(nhid, ntoken)
                self.init_weights()
                self.nhid = nhid
                self.nlayers = nlayers

            def init_weights(self):
                initrange = 0.1
                self.encoder.weight.data.uniform_(-initrange, initrange)
                self.decoder.bias.data.zero_()
                self.decoder.weight.data.uniform_(-initrange, initrange)

            def forward(self, input, hidden):
                emb = self.drop(self.encoder(input))
                output, hidden = self.rnn(emb, hidden)
                output = self.drop(output)
                decoded = self.decoder(output)
                return decoded, hidden

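        # FX graph mode quantization in this test requires torch >= 1.8.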
        version = get_torch_version()
        if version >= '1.8':
            model = LSTMModel(
                ntoken=10,
                ninp=512,
                nhid=256,
                nlayers=5,
            )

            # run fx_quant in lpot and save the quantized GraphModule
            model.eval()
            quantizer = Quantization('fx_dynamic_yaml.yaml')
            quantizer.model = common.Model(model, **{'a': 1})
            q_model = quantizer()
            q_model.save('./saved_dynamic_fx')

            # Load the configuration and weights via lpot.utils
            model_fx = load("./saved_dynamic_fx", model, **{'a': 1})
            if version >= '1.8':
                self.assertTrue(
                    isinstance(model_fx, torch.fx.graph_module.GraphModule))
            else:
                self.assertTrue(
                    isinstance(model_fx, torch._fx.graph_module.GraphModule))
Example #5
    def test_fx_quant(self):
        version = get_torch_version()
        if version >= '1.8':
            model_origin = torchvision.models.resnet18()

            # run fx_quant in lpot and save the quantized GraphModule
            quantizer = Quantization('fx_ptq_yaml.yaml')
            dataset = quantizer.dataset('dummy', (10, 3, 224, 224), label=True)
            quantizer.calib_dataloader = common.DataLoader(dataset)
            quantizer.eval_func = eval_func
            quantizer.model = common.Model(model_origin, **{'a': 1})
            q_model = quantizer()
            q_model.save('./saved_static_fx')

            # Load the configuration and weights via lpot.utils
            model_fx = load("./saved_static_fx", model_origin, **{'a': 1})
            self.assertTrue(
                isinstance(model_fx, torch.fx.graph_module.GraphModule))
Example #6
    def test_quantization_saved(self):
        from lpot import Quantization
        from lpot.utils.pytorch import load
        model = copy.deepcopy(self.model)
        for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']:
            if fake_yaml == 'ptq_yaml.yaml':
                model.eval()
                model.fuse_model()
            quantizer = Quantization(fake_yaml)
            dataset = quantizer.dataset('dummy', (100, 3, 256, 256),
                                        label=True)
            dataloader = quantizer.dataloader(dataset)
            q_model = quantizer(
                model,
                q_func=q_func if fake_yaml == 'qat_yaml.yaml' else None,
                q_dataloader=dataloader,
                eval_dataloader=dataloader)
            new_model = load('./saved/checkpoint', model)
            eval_func(new_model)
        from lpot import Benchmark
        evaluator = Benchmark('ptq_yaml.yaml')
        results = evaluator(model=new_model, b_dataloader=dataloader)
Example #7
def main():
    args = parser.parse_args()
    print(args)

    if args.img_size is None:
        args.img_size, args.crop_pct = get_image_size_crop_pct(args.model)

    if not args.checkpoint and not args.pretrained:
        args.pretrained = True

    if args.torchscript:
        geffnet.config.set_scriptable(True)

    # create model
    model = geffnet.create_model(args.model,
                                 num_classes=args.num_classes,
                                 in_chans=3,
                                 pretrained=args.pretrained,
                                 checkpoint_path=args.checkpoint)

    if args.torchscript:
        torch.jit.optimized_execution(True)
        model = torch.jit.script(model)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args)

    criterion = nn.CrossEntropyLoss()

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              args.num_gpu))).cuda()
        else:
            model = model.cuda()
        criterion = criterion.cuda()

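    # --tune: fuse modules and launch LPOT tuning driven by conf_<model>.yaml.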
    if args.tune:
        model.eval()
        model.fuse_model()
        conf_yaml = "conf_" + args.model + ".yaml"
        from lpot import Quantization
        quantizer = Quantization(conf_yaml)
        q_model = quantizer(model)
        exit(0)

    valdir = os.path.join(args.data, 'val')
    loader = create_loader(Dataset(valdir, load_bytes=args.tf_preprocessing),
                           input_size=data_config['input_size'],
                           batch_size=args.batch_size,
                           use_prefetcher=not args.no_cuda,
                           interpolation=data_config['interpolation'],
                           mean=data_config['mean'],
                           std=data_config['std'],
                           num_workers=args.workers,
                           crop_pct=data_config['crop_pct'],
                           tensorflow_preprocessing=args.tf_preprocessing)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    model.fuse_model()
    if args.int8:
        from lpot.utils.pytorch import load
        new_model = load(
            os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
    else:
        new_model = model

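    # Evaluation loop: batch timing is recorded only after the warmup iterations.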
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            if i >= args.warmup_iterations:
                start = time.time()
            if not args.no_cuda:
                target = target.cuda()
                input = input.cuda()

            # compute output
            output = new_model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            if i >= args.warmup_iterations:
                # measure elapsed time
                batch_time.update(time.time() - start)

            if i % args.print_freq == 0:
                print(
                    'Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses,
                        top1=top1,
                        top5=top5))
            if args.iterations > 0 and i >= args.iterations + args.warmup_iterations - 1:
                break

        print('Batch size = %d' % args.batch_size)
        if args.batch_size == 1:
            print('Latency: %.3f ms' % (batch_time.avg * 1000))
        print('Throughput: %.3f images/sec' %
              (args.batch_size / batch_time.avg))
        print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}'.format(
            top1=(top1.avg / 100), top5=(top5.avg / 100)))
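
# Note: AverageMeter and accuracy used in the benchmarking loops are defined elsewhere
# in the original scripts; they typically follow the PyTorch ImageNet reference example.
# A minimal sketch with those semantics (illustrative, not the original code):


class AverageMeter:
    """Tracks the current value, running sum, count, and average of a metric."""

    def __init__(self):
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for the given logits and labels."""
    maxk = max(topk)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / target.size(0)))
    return res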
Example #8
def main():
    # init the args
    args = Options().parse()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    # init dataloader
    interp = PIL.Image.BILINEAR if args.crop_size < 320 else PIL.Image.BICUBIC
    base_size = args.base_size if args.base_size is not None else int(1.0 * args.crop_size / 0.875)
    transform_val = transforms.Compose([
        ECenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    valset = ImageNetDataset(args.data, transform=transform_val, train=False)
    val_loader = torch.utils.data.DataLoader(
        valset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True if args.cuda else False)

    # assert args.model in torch.hub.list('zhanghang1989/ResNeSt', force_reload=True)
    functions = inspect.getmembers(module, inspect.isfunction)
    model_list = [f[0] for f in functions]
    assert args.model in model_list
    get_model = importlib.import_module('resnest.torch')
    net = getattr(get_model, args.model)
    # model = torch.hub.load('zhanghang1989/ResNeSt', args.model, pretrained=True)
    model = net(pretrained=True)
    # print(model)

    if args.cuda:
        model.cuda()
        # Please use CUDA_VISIBLE_DEVICES to control the number of gpus
        model = nn.DataParallel(model)

    # checkpoint
    if args.verify:
        if os.path.isfile(args.verify):
            print("=> loading checkpoint '{}'".format(args.verify))
            model.module.load_state_dict(torch.load(args.verify))
        else:
            raise RuntimeError("=> no verify checkpoint found at '{}'".
                               format(args.verify))
    elif args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.module.load_state_dict(checkpoint['state_dict'])
        else:
            raise RuntimeError("=> no resume checkpoint found at '{}'".
                               format(args.resume))

    model.eval()
    model.fuse_model()

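    # --tune: run LPOT tuning from conf.yaml and save the quantized model to the tuned checkpoint path.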
    if args.tune:
        from lpot.experimental import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

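    # For INT8 inference, restore the quantized model from the tuned checkpoint.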
    if args.int8:
        from lpot.utils.pytorch import load
        new_model = load(
            os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
    else:
        new_model = model

    top1 = AverageMeter()
    top5 = AverageMeter()
    batch_time = AverageMeter()
    iterations = args.iterations
    warmup = args.warmup_iterations
    tbar = tqdm(val_loader, desc='\r')
    for batch_idx, (data, target) in enumerate(tbar):
        if iterations == 0 or batch_idx < iterations + warmup:
            if batch_idx >= warmup:
                end = time.time()
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            with torch.no_grad():
                output = new_model(data)
                if batch_idx >= warmup:
                    batch_time.update(time.time() - end)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                top1.update(acc1[0], data.size(0))
                top5.update(acc5[0], data.size(0))

            tbar.set_description('Top1: %.3f | Top5: %.3f' % (top1.avg, top5.avg))
        elif batch_idx == iterations + warmup:
            break

    print('Batch size = %d' % args.batch_size)
    if args.batch_size == 1:
        print('Latency: %.3f ms' % (batch_time.avg * 1000))
    print('Throughput: %.3f images/sec' % (args.batch_size / batch_time.avg))
    print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}'
          .format(top1=(top1.avg / 100), top5=(top5.avg / 100)))
Example #9
def main():
    global args, best_acc1
    args = parser.parse_args()
    print('args:', args)

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # Val data loading
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(args.input_dim+32),
            transforms.CenterCrop(args.input_dim),
            transforms.ToTensor(),
            normalize,
        ]))

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers, pin_memory=True)

    num_classes = len(val_dataset.classes)
    print('Total classes: ',num_classes)

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'peleenet':
        model = PeleeNet(num_classes=num_classes)
    else:
        print("=> unsupported model '{}'. creating PeleeNet by default.".format(args.arch))
        model = PeleeNet(num_classes=num_classes)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide
        model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    elif args.pretrained:
        if os.path.isfile(args.weights):
            checkpoint = torch.load(args.weights, map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['state_dict'])

            print("=> loaded checkpoint '{}' (epoch {}, acc@1 {})"
                  .format(args.pretrained, checkpoint['epoch'], checkpoint['best_acc1']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

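    # --tune: fuse modules and run LPOT tuning driven by conf.yaml.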
    if args.tune:
        model.eval()
        model.module.fuse_model()
        from lpot import Quantization
        quantizer = Quantization("./conf.yaml")
        q_model = quantizer(model)
        exit(0)

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        exit(0)

    # Training data loading
    traindir = os.path.join(args.data, 'train')

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.input_dim),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best Acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
Example #10
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            datasets = load_dataset("csv", data_files=data_files)
        else:
            # Loading a dataset from local json files
            datasets = load_dataset("json", data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: int(label_name_to_id[label_list[i]])
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result."
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1)
                               for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name ==
                            "mnli" else "validation"]
    if data_args.task_name is not None or data_args.test_file is not None:
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

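    # --tune: quantize the fine-tuned model with LPOT, calibrating on the evaluation set.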
    if training_args.tune:

        def eval_func_for_lpot(model_tuned):
            trainer = Trainer(
                model=model_tuned,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer,
                data_collator=data_collator,
            )
            result = trainer.evaluate(eval_dataset=eval_dataset)
            bert_task_acc_keys = [
                'eval_f1', 'eval_accuracy', 'mcc', 'spearmanr', 'acc'
            ]
            for key in bert_task_acc_keys:
                if key in result.keys():
                    logger.info("Finally Eval {}:{}".format(key, result[key]))
                    acc = result[key]
                    break
            return acc

        from lpot.experimental import Quantization, common
        from transformers.data.data_collator import default_data_collator_lpot
        quantizer = Quantization("./conf.yaml")
        # calibration_dataset = quantizer.dataset('bert', dataset=eval_dataset,
        #                                  task="classifier", model_type=config.model_type)
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(
            eval_dataset,
            batch_size=training_args.per_device_eval_batch_size,
            collate_fn=default_data_collator_lpot)
        quantizer.eval_func = eval_func_for_lpot
        q_model = quantizer()
        q_model.save(training_args.tuned_checkpoint)
        exit(0)

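    # --accuracy_only / --benchmark: restore the INT8 model saved during tuning (when --int8) and report metrics.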
    if training_args.accuracy_only:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer = Trainer(
            model=new_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        results = trainer.evaluate(eval_dataset=eval_dataset)
        bert_task_acc_keys = [
            'eval_f1', 'eval_accuracy', 'mcc', 'spearmanr', 'acc'
        ]
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
        print('Throughput: %.3f samples/sec' %
              (results["eval_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["eval_samples_per_second"]))
        print('Batch size = %d' % training_args.per_gpu_eval_batch_size)
        exit(0)

    if training_args.benchmark:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer = Trainer(
            model=new_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        results = trainer.evaluate(eval_dataset=eval_dataset,
                                   iters=training_args.iters,
                                   warmup_iter=training_args.warmup_iter)
        bert_task_acc_keys = [
            'eval_f1', 'eval_accuracy', 'mcc', 'spearmanr', 'acc'
        ]
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
        print('Throughput: %.3f samples/sec' %
              (results["eval_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["eval_samples_per_second"]))
        print('Batch size = %d' % training_args.per_gpu_eval_batch_size)
        exit(0)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            trainer.log_metrics("eval", eval_result)
            trainer.save_metrics("eval", eval_result)

            eval_results.update(eval_result)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` column because it contains -1 and the Trainer won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            predictions = np.squeeze(
                predictions) if is_regression else np.argmax(predictions,
                                                             axis=1)

            output_test_file = os.path.join(training_args.output_dir,
                                            f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")
    return eval_results
Example #11
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None and data_args.src_lang
                is not None), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval or
                    training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.
            isdir(model_args.model_name_or_path) else None)
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

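    # --tune: quantize the seq2seq model with LPOT, using the validation set for calibration.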
    if training_args.tune:

        def eval_func_for_lpot(model):
            trainer.model = model
            results = trainer.evaluate(
                eval_dataset=eval_dataset,
                metric_key_prefix="val",
                max_length=data_args.val_max_target_length,
                num_beams=data_args.eval_beams)
            assert data_args.task.startswith("summarization") or data_args.task.startswith("translation") , \
                "data_args.task should startswith summarization or translation"
            task_metrics_keys = [
                'val_bleu', 'val_rouge1', 'val_rouge2', 'val_rougeL',
                'val_rougeLsum'
            ]
            for key in task_metrics_keys:
                if key in results.keys():
                    logger.info("Finally Eval {}:{}".format(key, results[key]))
                    if 'bleu' in key:
                        acc = results[key]
                        break
                    if 'rouge' in key:
                        acc = sum(
                            [v
                             for k, v in results.items() if "rouge" in k]) / 4
                        break
            return acc

        from lpot.experimental import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(
            eval_dataset,
            batch_size=training_args.eval_batch_size,
            collate_fn=Seq2SeqDataCollator_lpot(tokenizer, data_args,
                                                training_args.tpu_num_cores))
        quantizer.eval_func = eval_func_for_lpot
        q_model = quantizer()
        q_model.save(training_args.tuned_checkpoint)
        exit(0)

    if training_args.benchmark:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer.model = new_model
        results = trainer.evaluate(
            eval_dataset=eval_dataset,
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
            iters=training_args.iters,
            warmup_iter=training_args.warmup_iter,
        )
        if data_args.task.startswith("summarization"):
            print('Accuracy: %.4f' %
                  (sum([v for k, v in results.items() if "rouge" in k]) / 4))
        if data_args.task.startswith("translation"):
            print('Accuracy: %.4f' % (results['val_bleu']))
        print('Throughput: %.3f samples/sec' %
              (results["val_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["val_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        exit(0)

    if training_args.accuracy_only:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer.model = new_model
        results = trainer.evaluate(
            eval_dataset=eval_dataset,
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        if data_args.task.startswith("summarization"):
            print('Accuracy: %.4f' %
                  (sum([v for k, v in results.items() if "rouge" in k]) / 4))
        if data_args.task.startswith("translation"):
            print('Accuracy: %.4f' % (results['val_bleu']))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["val_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        exit(0)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():

            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics,
                  os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
Example #12
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    pytorch_version = get_torch_version()
    #args.gpu = gpu
    #affinity = subprocess.check_output("lscpu | grep 'NUMA node[0-9]' | awk '{ print $4 }' | awk -F',' '{ print $1 }'", shell=True)
    #os.environ['OMP_NUM_THREADS'] = '28'
    #os.environ['KMP_AFFINITY'] = 'proclist=[{}],granularity=thread,explicit'.format(affinity.splitlines()[gpu].decode('utf-8'))
    #print (os.environ['KMP_AFFINITY'])

    #if args.gpu is not None:
    #    print("Use GPU: {} for training".format(args.gpu))
    print("Use CPU: {} for training".format(gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        if args.ipex or pytorch_version >= '1.7':
            model = models.__dict__[args.arch](pretrained=True)
        else:
            model = quantize_models.__dict__[args.arch](pretrained=True,
                                                        quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        if args.ipex:
            model = models.__dict__[args.arch]()
        else:
            model = quantize_models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU...')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            #model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallelCPU(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    #criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    #cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)

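    # LPOT post-training quantization: wrap the FP32 model, drive the tuning loop from the
    # yaml config, and save the resulting int8 model to args.tuned_checkpoint.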
    if args.tune:
        from lpot.experimental import Quantization, common
        if args.ipex:
            quantizer = Quantization("./conf_ipex.yaml")
        else:
            model.eval()
            if pytorch_version < '1.7':
                model.fuse_model()
            quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

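    # Benchmark / accuracy path: either reload the LPOT-tuned int8 model or, for IPEX,
    # script/trace the model on the IPEX device before running validation.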
    if args.benchmark or args.accuracy_only:
        model.eval()
        ipex_config_path = None
        if args.int8:
            if args.ipex:
                # TODO: Remove this workaround once IPEX supports saving scripted models.
                model.to(ipex.DEVICE)
                try:
                    new_model = torch.jit.script(model)
                except Exception:
                    new_model = torch.jit.trace(
                        model,
                        torch.randn(1, 3, 224, 224).to(ipex.DEVICE))
                ipex_config_path = os.path.join(
                    os.path.expanduser(args.tuned_checkpoint),
                    "best_configure.json")
            else:
                if pytorch_version < '1.7':
                    model.fuse_model()
                from lpot.utils.pytorch import load
                new_model = load(
                    os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                    model)
        else:
            if args.ipex:
                # TODO: Remove this workaround once IPEX supports saving scripted models.
                model.to(ipex.DEVICE)
                try:
                    new_model = torch.jit.script(model)
                except Exception:
                    new_model = torch.jit.trace(
                        model,
                        torch.randn(1, 3, 224, 224).to(ipex.DEVICE))
            else:
                if pytorch_version < '1.7':
                    model.fuse_model()
                new_model = model
        validate(val_loader, new_model, criterion, args, ipex_config_path)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
Example #13
def main():
    args = get_args()

    if args.backend == "pytorch":
        from pytorch_SUT import PytorchSUT
        sut = PytorchSUT(args.pytorch_config_toml, args.pytorch_checkpoint,
                         args.dataset_dir, args.manifest, args.perf_count)
        model = sut.greedy_decoder._model
    else:
        raise ValueError("Unknown backend: {:}".format(args.backend))

    settings = lg.TestSettings()
    settings.scenario = scenario_map[args.scenario]
    settings.FromConfig(args.mlperf_conf, "rnnt", args.scenario)
    settings.FromConfig(args.user_conf, "rnnt", args.scenario)

    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    else:
        settings.mode = lg.TestMode.PerformanceOnly

    log_path = args.log_dir
    os.makedirs(log_path, exist_ok=True)
    log_output_settings = lg.LogOutputSettings()
    log_output_settings.outdir = log_path
    log_output_settings.copy_summary_to_stdout = True
    log_settings = lg.LogSettings()
    log_settings.log_output = log_output_settings

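    # Regexes used to pull the accuracy score out of the accuracy script output and the
    # samples_per_query / throughput numbers out of the MLPerf log summary.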
    pattern = [
        r'accuracy=\d+\.\d+', r'samples_per_query : \d+',
        r'Samples per second: \d+\.\d+'
    ]

    def eval_func(model):
        print("Running Loadgen test...")
        sut.greedy_decoder._model = model
        lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings,
                                    log_settings)
        if args.accuracy:
            cmd = f"python3 accuracy_eval.py --log_dir {log_path} \
               --dataset_dir {args.dataset_dir} --manifest {args.manifest}"

            out = subprocess.check_output(cmd, shell=True)
            out = out.decode()
            regex_accu = re.compile(pattern[0])
            accu = float(regex_accu.findall(out)[0].split('=')[1])
            return accu
        return 0

    def perf_func(model):
        print("Running Loadgen test...")
        sut.greedy_decoder._model = model
        lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings,
                                    log_settings)
        if not args.accuracy:
            file_path = os.path.join(log_path, 'mlperf_log_summary.txt')
            f = open(file_path, 'r', encoding='UTF-8')
            file_content = f.read()
            f.close()
            regex_batch = re.compile(pattern[1])
            regex_thro = re.compile(pattern[2])
            samples_per_query = int(
                regex_batch.findall(file_content)[0].split(': ')[1])
            samples_per_second = float(
                regex_thro.findall(file_content)[0].split(': ')[1])
            print('Batch size = %d' % samples_per_query)
            print('Latency: %.3f ms' % ((1 / samples_per_second) * 1000))
            print('Throughput: %.3f samples/sec' % samples_per_second)

    if args.tune:
        # Dynamic Quantization with LPOT
        from lpot.experimental import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)

    if args.benchmark:
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
        else:
            new_model = model
        perf_func(new_model)

    print("Done!", flush=True)
Example #14
def main():
    global args, best_prec1
    args = parser.parse_args()

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.pretrained.lower() not in ['false', 'none', 'not', 'no', '0']:
        print("=> using pre-trained parameters '{}'".format(args.pretrained))
        model = pretrainedmodels.__dict__[args.arch](
            num_classes=1000, pretrained=args.pretrained)
    else:
        model = pretrainedmodels.__dict__[args.arch]()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    # traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    # train_loader = torch.utils.data.DataLoader(
    #     datasets.ImageFolder(traindir, transforms.Compose([
    #         transforms.RandomSizedCrop(max(model.input_size)),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize,
    #     ])),
    #     batch_size=args.batch_size, shuffle=True,
    #     num_workers=args.workers, pin_memory=True)

    # if 'scale' in pretrainedmodels.pretrained_settings[args.arch][args.pretrained]:
    #     scale = pretrainedmodels.pretrained_settings[args.arch][args.pretrained]['scale']
    # else:
    #     scale = 0.875
    scale = 0.875

    print('Images transformed from size {} to {}'.format(
        int(round(max(model.input_size) / scale)), model.input_size))

    val_tf = pretrainedmodels.utils.TransformImage(
        model, scale=scale, preserve_aspect_ratio=args.preserve_aspect_ratio)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir, val_tf),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    model = torch.nn.DataParallel(model)

    if args.tune:
        model.eval()
        model.module.fuse_model()
        from lpot.experimental import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        return

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, args)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best)
Example #15
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--eval_data_file", default=None, type=str,
                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")

    parser.add_argument("--model_type", default="bert", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
                        help="The model checkpoint for weights initialization.")

    parser.add_argument("--mlm", action='store_true',
                        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument("--mlm_probability", type=float, default=0.15,
                        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument("--config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument('--save_total_limit', type=int, default=None,
                        help='Limit the total number of checkpoints; older checkpoints in the output_dir are deleted (no deletion by default).')
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument("--tune", action='store_true',
                        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument('-i', "--iter", default=0, type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config', type=str, default='conf.yaml', help="yaml config file")
    parser.add_argument("--do_fp32_inference", action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument('--benchmark', dest='benchmark', action='store_true',
                        help='run benchmark')
    parser.add_argument('-r', "--accuracy_only", dest='accuracy_only', action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH',
                        help='path to checkpoint tuned by Low Precision Optimization Tool (default: ./)')
    parser.add_argument('--int8', dest='int8', action='store_true',
                        help='load the int8 model tuned by LPOT for benchmark or accuracy measurement')
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                         "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                          cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case,
                                                cache_dir=args.cache_dir if args.cache_dir else None)
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        from_tf=bool('.ckpt' in args.model_name_or_path),
                                        config=config,
                                        cache_dir=args.cache_dir if args.cache_dir else None)
    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            
            if args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v.numpy()) for k, v in result.items())
                results.update(result)

            if args.tune:
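                # LPOT treats the value returned by eval_func as a higher-is-better score,
                # so 100 - perplexity is used as the tuning objective.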
                def eval_func_for_lpot(model):
                    result = evaluate(args, model, tokenizer, prefix=prefix)
                    return 100 - result['perplexity'].numpy()

                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                model.eval()

                from lpot.experimental import Quantization, common
                quantizer = Quantization(args.config)
                eval_dataset = WikiDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
                # Note that DistributedSampler samples randomly
                eval_sampler = SequentialSampler(eval_dataset)
                eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = eval_dataloader
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)

                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)
    return results
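Across these examples the LPOT tuning flow is the same; a condensed sketch is shown below (tune_and_reload is a hypothetical helper name, the individual calls are the ones used above):

import os

from lpot.experimental import Quantization, common
from lpot.utils.pytorch import load


def tune_and_reload(model, calib_dataloader, eval_func, config_yaml, ckpt_dir):
    # Hypothetical helper for illustration: search for an int8 configuration that
    # satisfies the tuning policy defined in config_yaml.
    quantizer = Quantization(config_yaml)
    quantizer.model = common.Model(model)
    quantizer.calib_dataloader = calib_dataloader
    quantizer.eval_func = eval_func
    q_model = quantizer()
    q_model.save(ckpt_dir)

    # Rebuild the quantized model from the saved configure and weights.
    return load(os.path.abspath(os.path.expanduser(ckpt_dir)), model)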
Example #16
        lpot_dataloader = yolo_dataLoader(dataloader)
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        quantizer.calib_dataloader = lpot_dataloader

        q_model = quantizer()
        q_model.save(opt.tuned_checkpoint)
        exit(0)

    if opt.benchmark:
        model.eval()
        model.fuse_model()
        if opt.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(opt.tuned_checkpoint)), model)
        else:
            new_model = model
        evaluate(
            new_model,
            path=valid_path,
            iou_thres=opt.iou_thres,
            conf_thres=opt.conf_thres,
            nms_thres=opt.nms_thres,
            img_size=opt.img_size,
            batch_size=opt.batch_size,
            iter=opt.iter,
            warmup=opt.warmup_iter
        )
        exit(0)
Example #17
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    cfg = Config(**json.load(open(config, "r")))

    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    dataset = TaskDataset(
        args.dataset_location,
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch,
                 global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

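    # Adapter that re-packages each torch DataLoader batch into the (inputs, labels)
    # tuples expected by the LPOT calibration dataloader.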
    class Bert_DataLoader(object):
        def __init__(self,
                     loader=None,
                     model_type=None,
                     device='cpu',
                     batch_size=1):
            self.loader = loader
            self.model_type = model_type
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {
                    'output_all': (batch[0], batch[1], batch[2]),
                    'labels': batch[3]
                }

                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        total_samples = 0
        total_time = 0
        index = 0

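        # Synthetic token-id batches shaped like the real BERT inputs, used purely for
        # throughput measurement so the benchmark does not depend on the dataset.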
        class RandomDataset(object):
            def __init__(self, size, shape):
                self.len = size
                self.input_ids = torch.randint(low=0,
                                               high=30522,
                                               size=(size, shape),
                                               dtype=torch.int64)
                self.segment_ids = torch.randint(low=0,
                                                 high=1,
                                                 size=(size, shape),
                                                 dtype=torch.int64)
                self.input_mask = torch.randint(low=0,
                                                high=1,
                                                size=(size, shape),
                                                dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids, self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index],
                        self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size,
                                 shuffle=True)

        for batch in rand_loader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))

    def eval_func(model):
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter,
                                          batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))
        print('Accuracy: %.3f ' % (total_accuracy))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file,
                         None)  # not use pretrain_file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")

        if args.tune:
            import lpot
            from lpot import common
            # lpot tune
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter,
                                              batch_size=args.batch_size)

            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)

        elif args.int8:
            from lpot.utils.pytorch import load
            int8_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)

        else:
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
Example #18
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list;"
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train",
                        action='store_true',
                        help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16",
                        action='store_true',
                        help="run bf16 evaluation / training.")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--warmup",
                        type=int,
                        default=2,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        default='conf.yaml',
                        type=str,
                        help='yaml config file path')
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='load the int8 model tuned by LPOT for benchmark or accuracy measurement')

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            logger.info("Evaluate:" + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v)
                              for k, v in result.items())
                results.update(result)

            if args.tune:

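                # LPOT treats the value returned by eval_func as a higher-is-better score;
                # the first matching GLUE metric is used as that single scalar.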
                def eval_func_for_lpot(model):
                    result, perf = evaluate(args,
                                            model,
                                            tokenizer,
                                            prefix=prefix)
                    bert_task_acc_keys = [
                        'acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                eval_task_names = (
                    "mnli", "mnli-mm") if args.task_name == "mnli" else (
                        args.task_name, )

                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args,
                                                           eval_task,
                                                           tokenizer,
                                                           evaluate=True)

                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                        1, args.n_gpu)
                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)

                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)
                    from lpot import Quantization, common
                    quantizer = Quantization(args.config)
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = quantizer.dataset(
                        'bert',
                        dataset=eval_dataset,
                        task=eval_task,
                        model_type=args.model_type)
                    quantizer.model = common.Model(model)
                    quantizer.calib_dataloader = common.DataLoader(
                        eval_dataset, batch_size=args.eval_batch_size)
                    quantizer.eval_func = eval_func_for_lpot
                    q_model = quantizer()
                    q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)

                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step,
                                     calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.error(
                        "Please run calibration before int8 inference")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

    return results
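
A minimal, self-contained sketch of the eval_func-driven LPOT flow used in the --tune branch above (not part of the original example): the YAML path, the torchvision model, and the random calibration data are placeholder assumptions, and a real run would reuse the task dataset handling shown above.

import torch
import torchvision
from lpot import Quantization, common

def sketch_tune(config_yaml="conf.yaml", tuned_dir="./saved"):
    # FP32 model to be quantized; any torch.nn.Module follows the same pattern.
    model = torchvision.models.resnet18().eval()

    # Tiny random calibration set standing in for real task data.
    calib_data = torch.utils.data.TensorDataset(
        torch.randn(8, 3, 224, 224), torch.zeros(8, dtype=torch.long))

    def eval_func(m):
        # LPOT tunes against the scalar this function returns (higher is better);
        # a constant is used here only to keep the sketch self-contained.
        with torch.no_grad():
            m(torch.randn(1, 3, 224, 224))
        return 1.0

    quantizer = Quantization(config_yaml)
    quantizer.model = common.Model(model)
    quantizer.calib_dataloader = common.DataLoader(calib_data, batch_size=2)
    quantizer.eval_func = eval_func
    q_model = quantizer()      # calibration plus accuracy-driven tuning
    q_model.save(tuned_dir)    # writes the tuning config and int8 weights
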
Exemple #19
0
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    print("Use CPU: {} for training".format(gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True, quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallelCPU(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.tune:

        def training_func_for_lpot(model):
            epochs = 8
            iters = 30
            optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
            for nepoch in range(epochs):
                model.train()
                cnt = 0
                for image, target in train_loader:
                    print('.', end='')
                    cnt += 1
                    output = model(image)
                    loss = criterion(output, target)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if cnt >= iters:
                        break

                if nepoch > 3:
                    # Freeze quantizer parameters
                    model.apply(torch.quantization.disable_observer)
                if nepoch > 2:
                    # Freeze batch norm mean and variance estimates
                    model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)

            return

        model.module.fuse_model()
        from lpot.experimental import Quantization, common
        quantizer = Quantization(args.config)
        quantizer.model = common.Model(model)
        quantizer.q_func = training_func_for_lpot
        quantizer.eval_dataloader = val_loader
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
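
A minimal sketch of the q_func (quantization-aware training) path used in the --tune branch of main_worker above, not part of the original example: the YAML path, the dummy data, and the torchvision quantizable model are assumptions, and a real run would reuse the train_loader/val_loader built earlier.

import torch
import torchvision
from lpot.experimental import Quantization, common

def sketch_qat_tune(config_yaml="conf.yaml", tuned_dir="./saved"):
    # Quantization-ready torchvision model; fuse conv/bn/relu first,
    # mirroring model.module.fuse_model() in the example above.
    model = torchvision.models.quantization.resnet18(quantize=False)
    model.fuse_model()

    dummy = torch.utils.data.TensorDataset(
        torch.randn(8, 3, 224, 224), torch.zeros(8, dtype=torch.long))
    loader = torch.utils.data.DataLoader(dummy, batch_size=2)
    criterion = torch.nn.CrossEntropyLoss()

    def training_func(m):
        # Stand-in for training_func_for_lpot: a short fine-tuning loop that
        # LPOT runs after inserting fake-quant/observer modules.
        optimizer = torch.optim.SGD(m.parameters(), lr=1e-4)
        m.train()
        for image, target in loader:
            optimizer.zero_grad()
            criterion(m(image), target).backward()
            optimizer.step()
        return

    quantizer = Quantization(config_yaml)
    quantizer.model = common.Model(model)
    quantizer.q_func = training_func
    quantizer.eval_dataloader = loader
    q_model = quantizer()
    q_model.save(tuned_dir)
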
Exemple #20
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="SQuAD task")
    parser.add_argument("--warmup",
                        type=int,
                        default=5,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='benchmark or evaluate the int8 model saved by tuning')

    args = parser.parse_args()

    args.predict_file = os.path.join(
        args.output_dir, 'predictions_{}_{}.txt'.format(
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir,
                                            force_download=True,
                                            mix_qkv=mix_qkv)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            if args.mkldnn_eval or args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True)
                model.to(args.device)

                # Evaluate
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_lpot(model):
                    result, _ = evaluate(args, model, tokenizer)
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                    bert_task_acc_keys = [
                        'best_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                dataset = load_and_cache_examples(args,
                                                  tokenizer,
                                                  evaluate=True,
                                                  output_examples=False)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                    1, args.n_gpu)
                eval_task = "squad"
                from lpot import Quantization
                quantizer = Quantization("./conf.yaml")
                dataset = quantizer.dataset('bert',
                                            dataset=dataset,
                                            task=eval_task,
                                            model_type=args.model_type)
                test_dataloader = quantizer.dataloader(
                    dataset, batch_size=args.eval_batch_size)
                quantizer(model, test_dataloader, eval_func=eval_func_for_lpot)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args,
                                     new_model,
                                     tokenizer,
                                     prefix=global_step)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                # Evaluate
                evaluate(args,
                         model,
                         tokenizer,
                         prefix=global_step,
                         calibration=True)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.info("Please run calibration first!")
                    return
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                print(model)
                with torch.autograd.profiler.profile() as prof:
                    result, _ = evaluate(args,
                                         model,
                                         tokenizer,
                                         prefix=global_step)
                print(prof.key_averages().table(sort_by="cpu_time_total"))
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)
    logger.info("Results: {}".format(results))

    return results
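
The do_calibration and do_int8_inference branches above rely on PyTorch eager-mode static quantization. Below is a compact sketch of the same calibrate-then-convert idea using only the public torch.quantization prepare/convert API; the model and loader are placeholders, and transformer models generally need the extra qconfig propagation and layer fallback handling shown above.

import torch

def sketch_static_quantize(model, calib_loader):
    """Post-training static quantization: observe activations, then convert."""
    model.eval()
    model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
    prepared = torch.quantization.prepare(model)      # insert observers
    with torch.no_grad():
        for inputs, _ in calib_loader:                # run calibration batches
            prepared(inputs)
    return torch.quantization.convert(prepared)       # swap in int8 modules

The converted model can then be saved (for example with save_pretrained or torch.save) and later reloaded by rebuilding the same quantized module graph and loading its state_dict, which is what the do_int8_inference branch does.
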
Exemple #21
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = (data_args.train_file.split(".")[-1]
                     if data_args.train_file is not None else
                     data_args.validation_file.split(".")[-1])
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warn(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
        block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warn(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: sum(examples[k], [])
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k:
            [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"] if training_args.do_train else None,
        eval_dataset=lm_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
    )
    train_dataset = lm_datasets["train"]
    eval_dataset = lm_datasets["validation"]
    if training_args.tune:

        def eval_func_for_lpot(model_tuned):
            trainer = Trainer(
                model=model_tuned,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                tokenizer=tokenizer,
                data_collator=default_data_collator,
            )
            eval_output = trainer.evaluate(eval_dataset=eval_dataset)
            perplexity = math.exp(eval_output["eval_loss"])
            results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\
                        "eval_samples_per_second":eval_output['eval_samples_per_second']}
            clm_task_metrics_keys = ["perplexity"]
            for key in clm_task_metrics_keys:
                if key in results.keys():
                    logger.info("Finally Eval {}:{}".format(key, results[key]))
                    if key == "perplexity":
                        perplexity = results[key]
                        break
            return 100 - perplexity

        from lpot.experimental import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(
            eval_dataset,
            batch_size=training_args.eval_batch_size,
            collate_fn=default_data_collator_lpot)
        quantizer.eval_func = eval_func_for_lpot
        q_model = quantizer()
        q_model.save(training_args.tuned_checkpoint)
        exit(0)

    if training_args.accuracy_only:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer = Trainer(
            model=new_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=default_data_collator,
        )
        eval_output = trainer.evaluate(eval_dataset=eval_dataset)
        perplexity = math.exp(eval_output["eval_loss"])
        results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\
                    "eval_samples_per_second":eval_output['eval_samples_per_second']}
        clm_task_metrics_keys = ["perplexity"]
        for key in clm_task_metrics_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
        print('Throughput: %.3f samples/sec' %
              (results["eval_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["eval_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        exit(0)

    if training_args.benchmark:
        if training_args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)), model)
        else:
            new_model = model
        trainer = Trainer(
            model=new_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=default_data_collator,
        )
        eval_output = trainer.evaluate(eval_dataset=eval_dataset,
                                       iters=training_args.iters,
                                       warmup_iter=training_args.warmup_iter)
        perplexity = math.exp(eval_output["eval_loss"])
        results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\
                    "eval_samples_per_second":eval_output['eval_samples_per_second']}
        clm_task_metrics_keys = ["perplexity"]
        for key in clm_task_metrics_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
        print('Throughput: %.3f samples/sec' %
              (results["eval_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["eval_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        exit(0)

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(
                model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        trainer.log_metrics("eval", results)
        trainer.save_metrics("eval", results)

    return results
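
All of the --int8 benchmark and accuracy branches in these examples reload the tuned result the same way; a minimal sketch, with placeholder paths:

import os
from lpot.utils.pytorch import load

def sketch_load_int8(fp32_model, tuned_checkpoint="./saved"):
    # LPOT rebuilds the quantized modules on top of the original FP32
    # definition from the configuration and weights written by q_model.save().
    return load(os.path.abspath(os.path.expanduser(tuned_checkpoint)), fp32_model)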