def test_quantization_saved(self):
    """Round-trip QAT and PTQ quantization through save()/load(), then
    benchmark and check the quantized accuracy stays within 0.01 of FP32."""
    from lpot.utils.pytorch import load
    model = copy.deepcopy(self.model)
    for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']:
        if fake_yaml == 'ptq_yaml.yaml':
            # Post-training quantization needs eval mode and fused modules.
            model.eval().fuse_model()
        quantizer = Quantization(fake_yaml)
        dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(dataset)
        quantizer.eval_dataloader = common.DataLoader(dataset)
        if fake_yaml == 'qat_yaml.yaml':
            # QAT requires a user-supplied training function.
            quantizer.q_func = q_func
        q_model = quantizer()
        q_model.save('./saved')
        # Load configure and weights by lpot.utils
        saved_model = load("./saved", model)
        eval_func(saved_model)
    from lpot import Benchmark
    evaluator = Benchmark('ptq_yaml.yaml')
    # Load configure and weights by lpot.model
    evaluator.model = common.Model(model)
    evaluator.b_dataloader = common.DataLoader(dataset)
    results = evaluator()
    evaluator.model = common.Model(model)
    fp32_results = evaluator()
    # results['accuracy'][0] is the quantized accuracy; presumably the second
    # evaluator run measures the FP32 baseline -- TODO confirm against
    # Benchmark's workspace handling.
    self.assertTrue(
        (fp32_results['accuracy'][0] - results['accuracy'][0]) < 0.01)
def test_tuning_ipex(self):
    """Quantize a torchvision ResNet-18 through the IPEX backend, reload the
    tuned workspace into a fresh IPEX model wrapper, JIT it, and benchmark.

    Fix: the TorchScript fallback used a bare ``except:``, which also
    swallows KeyboardInterrupt/SystemExit; it now catches ``Exception`` only.
    """
    from lpot import Quantization
    model = torchvision.models.resnet18()
    model = MODELS['pytorch_ipex'](model)
    quantizer = Quantization('ipex_yaml.yaml')
    dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
    quantizer.model = common.Model(model)
    quantizer.calib_dataloader = common.DataLoader(dataset)
    quantizer.eval_dataloader = common.DataLoader(dataset)
    lpot_model = quantizer()
    lpot_model.save("./saved")
    # Rebuild an IPEX model that picks up the tuned config from ./saved.
    new_model = MODELS['pytorch_ipex'](model.model, {
        "workspace_path": "./saved"
    })
    new_model.model.to(ipex.DEVICE)
    try:
        script_model = torch.jit.script(new_model.model)
    except Exception:
        # Scripting can fail on some models; fall back to tracing with a
        # representative input on the IPEX device.
        script_model = torch.jit.trace(
            new_model.model,
            torch.randn(10, 3, 224, 224).to(ipex.DEVICE))
    from lpot import Benchmark
    evaluator = Benchmark('ipex_yaml.yaml')
    evaluator.model = common.Model(script_model)
    evaluator.b_dataloader = common.DataLoader(dataset)
    results = evaluator()
def test_mlp_model_quantization(self):
    """Quantize the MLP symbol with minmax calibration and the built-in
    evaluation path; the result must still be an MXNet Symbol."""
    for data_shape in [(500, 1000)]:
        batch = data_shape[0]
        input_shapes, label_shapes, _ = self.mlp_model.infer_shape(
            data=data_shape)
        module = mx.mod.Module(symbol=self.mlp_model,
                               context=mx.current_context())
        module.bind(for_training=False,
                    data_shapes=[('data', input_shapes[0])],
                    label_shapes=[('softmax_label', label_shapes[0])])
        module.init_params()
        weights, aux_states = module.get_params()
        # One batch of uniform random inputs covering every sample.
        inputs = mx.nd.random.uniform(low=self.data_low,
                                      high=self.data_high,
                                      shape=data_shape).astype('float32')
        targets = mx.nd.ones([batch, ])
        calib_iter = mx.io.NDArrayIter(data=inputs,
                                       label=targets,
                                       batch_size=batch)
        self.quantizer_1.model = common.Model(
            (self.mlp_model, weights, aux_states))
        self.quantizer_1.calib_dataloader = calib_iter
        self.quantizer_1.eval_dataloader = calib_iter
        qmodel = self.quantizer_1()
        self.assertIsInstance(qmodel.model[0], mx.symbol.Symbol)
def test_gluon_model(self):
    """Quantize a small Gluon MLP and verify the gluon path of the MXNet
    adaptor returns a HybridBlock.

    Fix: ``Quant_dataloader.__getitem__`` previously took no index argument,
    so any indexed access (``dl[0]``) would raise TypeError; it now accepts
    the mandatory index parameter required by the sequence protocol.
    """
    # create gluon model
    net = nn.HybridSequential()
    net.add(nn.Dense(128, activation="relu"))
    net.add(nn.Dense(64, activation="relu"))
    net.add(nn.Dense(10))
    net.initialize()

    class Quant_dataloader():
        """Minimal iterable wrapper exposing batch_size, as LPOT expects."""

        def __init__(self, dataset, batch_size=1):
            self.dataset = dataset
            self.batch_size = batch_size

        def __iter__(self):
            for data, label in self.dataset:
                yield data, label

        def __getitem__(self, index):
            # Stub: random access is not exercised by this test, but the
            # protocol requires the index parameter.
            pass

    valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False)
    q_dataloader = Quant_dataloader(valid_dataset)
    self.quantizer_1.model = common.Model(net)
    self.quantizer_1.calib_dataloader = q_dataloader
    self.quantizer_1.eval_func = eval_func
    qmodel = self.quantizer_1()
    self.assertIsInstance(qmodel.model, mx.gluon.HybridBlock)
def main():
    """Quantize the frozen MobileNet V1 graph using the tuning settings
    declared in ./conf.yaml."""
    from lpot import Quantization, common
    tuner = Quantization('./conf.yaml')
    tuner.model = common.Model("./mobilenet_v1_1.0_224_frozen.pb")
    quantized_model = tuner()
def main():
    """BraTS 3D-UNet entry point: LPOT int8 tuning (--tune) or evaluation of
    the FP32 / previously tuned int8 model (--benchmark)."""

    class CalibrationDL():
        # Calibration dataloader over pre-processed BraTS samples listed in
        # ./brats_cal_images_list.txt; yields (input tensor, None) pairs.

        def __init__(self):
            path = os.path.abspath(
                os.path.expanduser('./brats_cal_images_list.txt'))
            with open(path, 'r') as f:
                self.preprocess_files = [line.rstrip() for line in f]
            self.loaded_files = {}  # cache of already-unpickled samples
            self.batch_size = 1

        def __getitem__(self, sample_id):
            file_name = self.preprocess_files[sample_id]
            print("Loading file {:}".format(file_name))
            with open(
                    os.path.join('build/calib_preprocess/',
                                 "{:}.pkl".format(file_name)), "rb") as f:
                self.loaded_files[sample_id] = pickle.load(f)[0]
            # Label is None: calibration only needs inputs.
            return torch.from_numpy(
                self.loaded_files[sample_id][np.newaxis, ...]).float(), None

        def __len__(self):
            self.count = len(self.preprocess_files)
            return self.count

    args = get_args()
    assert args.backend == "pytorch"
    model_path = os.path.join(args.model_dir, "plans.pkl")
    assert os.path.isfile(
        model_path), "Cannot find the model file {:}!".format(model_path)
    trainer, params = load_model_and_checkpoint_files(
        args.model_dir,
        folds=1,
        fp16=False,
        checkpoint_name='model_final_checkpoint')
    trainer.load_checkpoint_ram(params[0], False)
    model = trainer.network
    if args.tune:
        quantizer = Quantization('conf.yaml')
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        calib_dl = CalibrationDL()
        quantizer.calib_dataloader = calib_dl
        q_model = quantizer()
        q_model.save('./lpot_workspace')
        exit(0)
    if args.benchmark:
        model.eval()
        if args.int8:
            from lpot.utils.pytorch import load
            # Restore tuned int8 configuration/weights on top of the FP32
            # model loaded above.
            new_model = load(
                os.path.abspath(os.path.expanduser('./lpot_workspace')),
                model)
        else:
            new_model = model
        eval_func(new_model)
def main():
    """Post-training quantization of the RFCN ResNet-101 COCO model
    directory, calibrated on randomly generated dummy images."""
    import lpot
    tuner = lpot.Quantization('./conf.yaml')
    dummy_images = tuner.dataset('dummy', shape=(100, 100, 100, 3),
                                 label=True)
    tuner.model = common.Model(
        './model/public/rfcn-resnet101-coco-tf/model/public/rfcn-resnet101-coco-tf/rfcn_resnet101_coco_2018_01_28/'
    )
    tuner.calib_dataloader = common.DataLoader(dummy_images)
    quantized_model = tuner()
def main():
    """Quantize the frozen MobileNet V1 graph, then (optionally) benchmark
    the quantized model and print per-mode accuracy and latency.

    Fixes: removed the dead ``batch_size = 1`` assignment (immediately
    overwritten by tuple unpacking inside the loop) and the unused ``mode``
    loop variable (iterate ``results.values()`` instead of ``.items()``).
    """
    import lpot
    from lpot import common
    quantizer = lpot.Quantization('./conf.yaml')
    quantizer.model = common.Model("./mobilenet_v1_1.0_224_frozen.pb")
    quantized_model = quantizer()

    # Optional, run benchmark
    from lpot import Benchmark
    evaluator = Benchmark('./conf.yaml')
    evaluator.model = common.Model(quantized_model)
    results = evaluator()
    for result in results.values():
        acc, batch_size, result_list = result
        # Per-sample latency: mean per-batch wall time divided by batch size.
        latency = np.array(result_list).mean() / batch_size
        print('Accuracy is {:.3f}'.format(acc))
        print('Latency: {:.3f} ms'.format(latency * 1000))
def test_tensor_dump(self):
    """Run quantization with tensor dumping enabled (dump_yaml.yaml) and
    check the baseline accuracy dump file exists, both with a user
    ``eval_func`` and with an ``eval_dataloader``.

    Fix: ``assertTrue(True if X else False)`` collapsed to the equivalent
    ``assertTrue(X)``.
    """
    model = copy.deepcopy(self.lpot_model)
    model.model.eval().fuse_model()
    quantizer = Quantization('dump_yaml.yaml')
    dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
    quantizer.model = common.Model(model.model)
    quantizer.calib_dataloader = common.DataLoader(dataset)
    quantizer.eval_func = eval_func
    quantizer()
    self.assertTrue(os.path.exists('runs/eval/baseline_acc0.0'))
    # Second pass: evaluation driven by a dataloader instead of eval_func.
    quantizer.eval_dataloader = common.DataLoader(dataset)
    quantizer()
    self.assertTrue(os.path.exists('runs/eval/baseline_acc0.0'))
def test_quantizate(self):
    """Quantize ResNet-50 under static/dynamic configs and MobileNetV2 under
    the non-MSE config, evaluating each quantized model.

    Fix: the two previous loops had byte-identical bodies differing only in
    config/model; they are merged into one loop over (yaml, model) pairs,
    preserving the original execution order.
    """
    from lpot import Quantization, common
    for fake_yaml, fp32_model in [
            ("static_yaml.yaml", self.rn50_model),
            ("dynamic_yaml.yaml", self.rn50_model),
            ("non_MSE_yaml.yaml", self.mb_v2_model),
    ]:
        quantizer = Quantization(fake_yaml)
        dataset = quantizer.dataset("dummy", (100, 3, 224, 224),
                                    low=0., high=1., label=True)
        quantizer.calib_dataloader = common.DataLoader(dataset)
        quantizer.eval_dataloader = common.DataLoader(dataset)
        quantizer.model = common.Model(fp32_model)
        q_model = quantizer()
        eval_func(q_model)
def run(self):
    """Entry point: quantize the input graph when --tune is given, and/or
    benchmark it when --benchmark is given, printing per-mode metrics."""
    args = self.args
    if args.tune:
        from lpot import Quantization, common
        tuner = Quantization(args.config)
        tuner.model = common.Model(args.input_graph)
        tuned_model = tuner()
        tuned_model.save(args.output_graph)

    if args.benchmark:
        from lpot import Benchmark, common
        bench = Benchmark(args.config)
        bench.model = common.Model(args.input_graph)
        for mode, result in bench().items():
            acc, batch_size, result_list = result
            # Per-sample latency from the mean per-batch time.
            latency = np.array(result_list).mean() / batch_size
            print('\n{} mode benchmark result:'.format(mode))
            print('Accuracy is {:.3f}'.format(acc))
            print('Batch size = {}'.format(batch_size))
            print('Latency: {:.3f} ms'.format(latency * 1000))
            print('Throughput: {:.3f} images/sec'.format(1. / latency))
def main():
    """Quantize a tf_slim Inception V1 checkpoint with LPOT."""
    import lpot
    from lpot import common
    quantizer = lpot.Quantization('./conf.yaml')

    # Get graph from slim checkpoint
    from tf_slim.nets import inception
    # NOTE(review): model_func, arg_scope and kwargs are never referenced
    # below -- presumably LPOT rebuilds the slim graph from the checkpoint
    # name itself; confirm before deleting. `images` registers a placeholder
    # in the default TF graph, which may be a required side effect.
    model_func = inception.inception_v1
    arg_scope = inception.inception_v1_arg_scope()
    kwargs = {'num_classes': 1001}
    inputs_shape = [None, 224, 224, 3]
    images = tf.compat.v1.placeholder(name='input', \
                                      dtype=tf.float32, shape=inputs_shape)

    # Do quantization
    quantizer.model = common.Model('./inception_v1.ckpt')
    quantized_model = quantizer()
def tune_model(
    input_graph: str,
    output_graph: str,
    config: str,
    framework: str,
) -> None:
    """Execute tuning.

    Quantizes *input_graph* with the LPOT configuration in *config* and
    writes the tuned model to *output_graph*. ONNX Runtime models are first
    loaded into an onnx ModelProto; other frameworks pass the path through.
    """
    from lpot import Quantization, common

    graph = input_graph
    if framework == "onnxrt":
        import onnx
        graph = onnx.load(graph)

    tuner = Quantization(config)
    tuner.model = common.Model(graph)
    tuned_model = tuner()
    tuned_model.save(output_graph)
def main(_):
    """Dispatch on FLAGS.mode: 'tune' quantizes the loaded graph, 'benchmark'
    runs a timed evaluation, 'accuracy' evaluates the full dataset."""
    graph = load_graph(FLAGS.input_graph)
    if FLAGS.mode == 'tune':
        from lpot import Quantization, common
        quantizer = Quantization(FLAGS.config)
        calib_set = Dataset(FLAGS.inputs_file, FLAGS.reference_file,
                            FLAGS.vocab_file)
        quantizer.calib_dataloader = common.DataLoader(
            calib_set, collate_fn=collate_fn, batch_size=FLAGS.batch_size)
        quantizer.model = common.Model(graph)
        quantizer.eval_func = eval_func
        tuned_model = quantizer()
        # Saving is best effort: report but don't crash on failure.
        try:
            tuned_model.save(FLAGS.output_model)
        except Exception as err:
            print("Failed to save model due to {}".format(str(err)))
    elif FLAGS.mode == 'benchmark':
        eval_func(graph, FLAGS.iters)
    elif FLAGS.mode == 'accuracy':
        eval_func(graph, -1)
def benchmark_model(
    input_graph: str,
    config: str,
    benchmark_mode: str,
    framework: str,
    datatype: str = "",
) -> List[Dict[str, Any]]:
    """Execute benchmark.

    Runs the LPOT Benchmark for *input_graph*, logs metrics for the requested
    *benchmark_mode*, and returns one result dict per matching mode.
    """
    from lpot import Benchmark, common

    if framework == "onnxrt":
        import onnx
        input_graph = onnx.load(input_graph)

    evaluator = Benchmark(config)
    evaluator.model = common.Model(input_graph)

    collected: List[Dict[str, Any]] = []
    for mode, result in evaluator().items():
        # Only report the mode the caller asked for.
        if mode != benchmark_mode:
            continue
        log.info(f"Mode: {mode}")
        acc, batch_size, result_list = result
        # Per-sample latency: mean per-batch time divided by batch size.
        latency = (sum(result_list) / len(result_list)) / batch_size
        log.info(f"Batch size: {batch_size}")
        if mode == "accuracy":
            log.info(f"Accuracy: {acc:.3f}")
        elif mode == "performance":
            log.info(f"Latency: {latency * 1000:.3f} ms")
            log.info(f"Throughput: {1. / latency:.3f} images/sec")
        collected.append(
            {
                "precision": datatype,
                "mode": mode,
                "batch_size": batch_size,
                "accuracy": acc,
                "latency": latency * 1000,
                "throughput": 1.0 / latency,
            },
        )
    return collected
def test_conv_model_quantization(self):
    """Quantize the Conv model via KL calibration with a user eval_func,
    then exercise the adaptor's inspect_tensor on both the FP32 and the
    quantized model."""
    for shape in [(500, 3, 224, 224),]:
        arg_shapes, _, _ = self.conv_model.infer_shape(data=shape)
        mod = mx.mod.Module(symbol=self.conv_model,
                            context=mx.current_context())
        mod.bind(for_training=False, data_shapes=[('data', arg_shapes[0])])
        mod.init_params()
        arg_params, aux_params = mod.get_params()
        # Single batch of uniform random inputs covering every sample.
        data = mx.nd.random.uniform(low=self.data_low,
                                    high=self.data_high,
                                    shape=shape).astype('float32')
        calib_data = mx.io.NDArrayIter(data=data, batch_size=shape[0])
        fp32_model = (self.conv_model, arg_params, aux_params)
        self.quantizer_2.model = common.Model(fp32_model)
        self.quantizer_2.calib_dataloader = calib_data
        self.quantizer_2.eval_dataloader = calib_data
        self.quantizer_2.eval_func = eval_func
        qmodel = self.quantizer_2()
        # test inspected_tensor
        inspect_tensor = self.quantizer_2.strategy.adaptor.inspect_tensor
        self.quantizer_2.model = fp32_model
        # Dump the fused conv output and raw input at selected iterations.
        inspected_tensor = inspect_tensor(
            self.quantizer_2.model, calib_data,
            op_list=[('sg_mkldnn_conv_bn_act_0_output', 'CONV'),
                     ('data', 'input')],
            iteration_list=[0, 2, 4])
        inspected_qtensor = inspect_tensor(
            qmodel, calib_data,
            op_list=[('quantized_sg_mkldnn_conv_bn_act_0_output', 'CONV')],
            iteration_list=[0])
        self.assertNotEqual(len(inspected_tensor), 0)
        self.assertNotEqual(len(inspected_qtensor), 0)
        self.assertIsInstance(qmodel.model[0], mx.symbol.Symbol)
def test_pruning(self):
    """Drive an LPOT Pruning session end to end using a user training
    callback that fires the pruning hooks at epoch/batch boundaries."""
    from lpot import Pruning, common
    prune = Pruning('fake.yaml')
    dummy_dataset = PyTorchDummyDataset([tuple([100, 3, 256, 256])])
    dummy_dataloader = PyTorchDataLoader(dummy_dataset)

    def training_func_for_lpot(model):
        # Minimal SGD training loop; the prune.on_* calls are the pruning
        # hook points required by the Pruning API.
        epochs = 16
        iters = 30
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
        for nepoch in range(epochs):
            model.train()
            cnt = 0
            prune.on_epoch_begin(nepoch)
            for image, target in dummy_dataloader:
                prune.on_batch_begin(cnt)
                print('.', end='')
                cnt += 1
                output = model(image)
                loss = criterion(output, target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                prune.on_batch_end()
                if cnt >= iters:
                    # Cap each epoch at `iters` batches to keep the test fast.
                    break
            prune.on_epoch_end()

    # Rebuild the dataset with labels for evaluation.
    dummy_dataset = PyTorchDummyDataset(tuple([100, 3, 256, 256]), label=True)
    dummy_dataloader = PyTorchDataLoader(dummy_dataset)
    prune.model = common.Model(self.model)
    prune.q_func = training_func_for_lpot
    prune.eval_dataloader = dummy_dataloader
    _ = prune()
def main():
    """CLI entry point: fine-tune/evaluate a SQuAD model, with optional LPOT
    int8 tuning (--tune), manual calibration (--do_calibration), int8
    inference (--do_int8_inference), and benchmarking (--benchmark)."""
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="SQuAD task")
    parser.add_argument("--warmup",
                        type=int,
                        default=5,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        type=str,
                        default='conf.yaml',
                        help="yaml config file")
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='run benchmark')
    args = parser.parse_args()

    # Prediction file name is keyed by the model's basename and max_seq_length.
    args.predict_file = os.path.join(
        args.output_dir, 'predictions_{}_{}.txt'.format(
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        # NOTE(review): this message was reconstructed across a garbled line
        # break in the original source -- verify wording against upstream.
        raise ValueError(
            "Output directory ({}) already exists and is "
            "not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # NOTE(review): mix_qkv presumably enables the fused/mixed QKV model
    # variant needed by the int8 paths -- confirm in the model class.
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        cache_dir=args.cache_dir if args.cache_dir else None)
    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir,
                                            force_download=True,
                                            mix_qkv=mix_qkv)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            if args.mkldnn_eval or args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True)
                model.to(args.device)

                # Evaluate
                result, _ = evaluate(args, model, tokenizer,
                                     prefix=global_step)
                # Suffix metric names with the checkpoint step when
                # evaluating several checkpoints.
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''),
                     v) for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_lpot(model):
                    # LPOT accuracy callback: run SQuAD eval and return the
                    # first accuracy-style metric found in the results.
                    result, _ = evaluate(args, model, tokenizer)
                    for key in sorted(result.keys()):
                        logger.info(" %s = %s", key, str(result[key]))
                    bert_task_acc_keys = [
                        'best_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                dataset = load_and_cache_examples(args,
                                                  tokenizer,
                                                  evaluate=True,
                                                  output_examples=False)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                    1, args.n_gpu)
                eval_task = "squad"
                from lpot import Quantization, common
                quantizer = Quantization(args.config)
                dataset = quantizer.dataset('bert',
                                            dataset=dataset,
                                            task=eval_task,
                                            model_type=args.model_type)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = common.DataLoader(
                    dataset, batch_size=args.eval_batch_size)
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    # Restore the tuned int8 configuration/weights.
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args,
                                     new_model,
                                     tokenizer,
                                     prefix=global_step)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                # Manual eager-mode static quantization: attach observers,
                # run calibration, convert, then save + evaluate.
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                # Evaluate
                evaluate(args,
                         model,
                         tokenizer,
                         prefix=global_step,
                         calibration=True)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                result, _ = evaluate(args, model, tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''),
                     v) for k, v in result.items())
                results.update(result)

            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                # Rebuild the quantized module structure, then load the
                # calibrated weights produced by --do_calibration.
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.info("Please run calibration first!")
                    return
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                print(model)
                with torch.autograd.profiler.profile() as prof:
                    result, _ = evaluate(args, model, tokenizer,
                                         prefix=global_step)
                print(prof.key_averages().table(sort_by="cpu_time_total"))
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''),
                     v) for k, v in result.items())
                results.update(result)

    logger.info("Results: {}".format(results))
    return results
parser.add_argument( '--tune', action='store_true', \ default=False, help="whether quantize the model" ) parser.add_argument('--config', type=str, help="config yaml path") parser.add_argument('--output_model', type=str, help="output model path") args = parser.parse_args() model = onnx.load(args.model_path) if args.benchmark: from lpot import Benchmark, common evaluator = Benchmark(args.config) evaluator.model = common.Model(model) results = evaluator() for mode, result in results.items(): acc, batch_size, result_list = result latency = np.array(result_list).mean() / batch_size print('\n{} mode benchmark result:'.format(mode)) print('Accuracy is {:.3f}'.format(acc)) print('Batch size = {}'.format(batch_size)) print('Latency: {:.3f} ms'.format(latency * 1000)) print('Throughput: {:.3f} images/sec'.format(batch_size * 1. / latency)) if args.tune: from lpot import Quantization, common
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: build or restore an ImageNet classifier, then
    evaluate, LPOT-tune, benchmark, or train it depending on ``args`` flags.

    Args:
        gpu: worker index; used for logging and, in multiprocessing-distributed
            mode, to derive the global rank (training itself runs on CPU here).
        ngpus_per_node: processes per node; used to derive rank and to split
            batch size / workers in single-GPU-per-process mode.
        args: parsed command-line namespace (data paths, hyperparameters, and
            the evaluate / tune / benchmark / int8 / ipex mode switches).
    """
    global best_acc1
    #args.gpu = gpu
    #affinity = subprocess.check_output("lscpu | grep 'NUMA node[0-9]' | awk '{ print $4 }' | awk -F',' '{ print $1 }'", shell=True)
    #os.environ['OMP_NUM_THREADS'] = '28'
    #os.environ['KMP_AFFINITY'] = 'proclist=[{}],granularity=thread,explicit'.format(affinity.splitlines()[gpu].decode('utf-8'))
    #print (os.environ['KMP_AFFINITY'])
    #if args.gpu is not None:
    #    print("Use GPU: {} for training".format(args.gpu))
    print("Use CPU: {} for training".format(gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    # IPEX uses the stock torchvision models; otherwise use the quantizable
    # variants (quantize=False: fp32 weights, but fuse_model() is available).
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        if args.ipex:
            model = models.__dict__[args.arch](pretrained=True)
        else:
            model = quantize_models.__dict__[args.arch](pretrained=True,
                                                        quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        if args.ipex:
            model = models.__dict__[args.arch]()
        else:
            model = quantize_models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU...')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            #model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallelCPU(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    #criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    #cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        # NOTE(review): unlike the sibling scripts there is no `return` after
        # this validate(), so execution falls through to the branches below —
        # confirm whether that is intentional.
        validate(val_loader, model, criterion, args)

    if args.tune:
        from lpot import Quantization, common
        if args.ipex:
            quantizer = Quantization("./conf_ipex.yaml")
        else:
            # Non-IPEX path requires eval mode + operator fusion before
            # post-training quantization.
            model.eval()
            model.fuse_model()
            quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

    if args.benchmark or args.accuracy_only:
        model.eval()
        ipex_config_path = None
        if args.int8:
            if args.ipex:
                # TODO: It will remove when IPEX spport to save script model.
                model.to(ipex.DEVICE)
                # Fall back to tracing with a dummy input if scripting fails.
                try:
                    new_model = torch.jit.script(model)
                except:
                    new_model = torch.jit.trace(
                        model,
                        torch.randn(1, 3, 224, 224).to(ipex.DEVICE))
                ipex_config_path = os.path.join(
                    os.path.expanduser(args.tuned_checkpoint),
                    "best_configure.json")
            else:
                model.fuse_model()
                from lpot.utils.pytorch import load
                # Reload the int8 weights/config produced by a prior tune run.
                new_model = load(
                    os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                    model)
        else:
            if args.ipex:
                # TODO: It will remove when IPEX spport to save script model.
                model.to(ipex.DEVICE)
                try:
                    new_model = torch.jit.script(model)
                except:
                    new_model = torch.jit.trace(
                        model,
                        torch.randn(1, 3, 224, 224).to(ipex.DEVICE))
            else:
                model.fuse_model()
                new_model = model
        validate(val_loader, new_model, criterion, args, ipex_config_path)
        return

    # Plain training loop (no tune/benchmark flag set).
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
def main_worker(gpu, args):
    """Worker for the LPOT pruning example: build/restore a torchvision
    model, then either evaluate it or run LPOT's ``Pruning`` flow, which
    drives the nested fine-tuning loop ``training_func_for_lpot``.

    Args:
        gpu: worker index, used only for the startup log line.
        args: parsed command-line namespace (data path, hyperparameters,
            ``evaluate`` / ``prune`` mode switches, LPOT ``config`` path).
    """
    global best_acc1
    print("Use CPU: {} for training".format(gpu))

    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True, quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.prune:
        from lpot import Pruning, common
        prune = Pruning(args.config)

        def training_func_for_lpot(model):
            """Fine-tuning loop handed to LPOT; invokes the prune-callback
            hooks around each epoch/batch. Closes over train_loader,
            val_loader, criterion, args, and prune from the enclosing scope.
            """
            epochs = 16
            iters = 30  # cap batches per epoch to keep the demo fast
            # NOTE: intentionally shadows the outer optimizer with a fresh,
            # lower-LR SGD for fine-tuning.
            optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
            for nepoch in range(epochs):
                model.train()
                cnt = 0
                prune.on_epoch_begin(nepoch)
                for image, target in train_loader:
                    prune.on_batch_begin(cnt)
                    print('.', end='')
                    cnt += 1
                    output = model(image)
                    loss = criterion(output, target)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    prune.on_batch_end()
                    if cnt >= iters:
                        break
                prune.on_epoch_end()
                if nepoch > 3:
                    # Freeze quantizer parameters
                    model.apply(torch.quantization.disable_observer)
                if nepoch > 2:
                    # Freeze batch norm mean and variance estimates
                    model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)
                validate(val_loader, model, criterion, args)
            return

        prune.model = common.Model(model)
        prune.eval_dataloader = val_loader
        prune.q_func = training_func_for_lpot
        q_model = prune()
    return
    # NOTE(review): tail of a function whose `def` lies above this chunk —
    # indentation reconstructed (assumed 4 spaces); confirm against full file.
    EM_acc = evaluate(model)
    return EM_acc


if __name__ == '__main__':
    # Dispatch on mode flags: calibrate only, train+evaluate, LPOT auto-tune,
    # or evaluate a saved/deployed model.
    if only_calibration:
        try:
            calibration(net, num_calib_batches, quantized_dtype, calib_mode)
        except AttributeError:
            # Older MXNet builds lack the calibration API used above.
            nlp.utils.version.check_version('1.7.0',
                                            warning_only=True,
                                            library=mx)
            warnings.warn('INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115')
    elif not only_predict:
        train()
        evaluate()
    elif args.tune:
        # lpot auto-tuning
        # The dev set serves as both calibration and evaluation data.
        dev_dataloader = gen_dataset()
        from lpot import Quantization, common
        quantizer = Quantization("./bert.yaml")
        quantizer.model = common.Model(net)
        quantizer.calib_dataloader = dev_dataloader
        quantizer.eval_dataloader = dev_dataloader
        quantizer.eval_func = eval_func
        q_model = quantizer()
        q_model.save(args.output_dir)
    elif model_parameters or deploy:
        evaluate()
    # NOTE(review): tail of a function whose `def` lies above this chunk —
    # indentation reconstructed (assumed 4 spaces); confirm against full file.
    logger.info("speed is %.2f samples/s" % speed)
    return acc


if __name__ == '__main__':
    # Dispatch: calibrate only, LPOT auto-tune (inference mode), or train.
    if only_calibration:
        try:
            calibration(model, dev_data_list, num_calib_batches,
                        quantized_dtype, calib_mode)
        except AttributeError:
            # Older MXNet builds lack the calibration API used above.
            nlp.utils.version.check_version('1.7.0',
                                            warning_only=True,
                                            library=mx)
            warnings.warn(
                'INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115')
    elif args.tune:
        # lpot auto-tuning
        if only_inference:
            # First dev split doubles as calibration and evaluation data.
            calib_data = dev_data_list[0][1]
            from lpot import Quantization, common
            quantizer = Quantization("./bert.yaml")
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = calib_data
            quantizer.eval_dataloader = calib_data
            quantizer.eval_func = test_func
            q_model = quantizer()
            q_model.save(args.output_dir)
    else:
        train(task.metrics)
        pass

    def reset(self):
        """Clear accumulated predictions, labels, and sample count."""
        self.pred_list = []
        self.label_list = []
        self.samples = 0
        pass

    def result(self):
        """Return accuracy: fraction of accumulated predictions that
        exactly match their labels."""
        correct_num = np.sum(
            np.array(self.pred_list) == np.array(self.label_list))
        return correct_num / self.samples


# Quantize with customized dataloader and metric
quantizer = lpot.Quantization('./conf.yaml')
dataset = Dataset()
quantizer.metric = common.Metric(MyMetric, 'hello_metric')
quantizer.calib_dataloader = common.DataLoader(dataset, batch_size=1)
quantizer.eval_dataloader = common.DataLoader(dataset, batch_size=1)
quantizer.model = common.Model('../models/simple_model')
q_model = quantizer()

# Optional, run quantized model
import tensorflow as tf
with tf.compat.v1.Graph().as_default(), tf.compat.v1.Session() as sess:
    tf.compat.v1.import_graph_def(q_model.as_graph_def(), name='')
    styled_image = sess.run(['output:0'],
                            feed_dict={'input:0': dataset.test_images})
    print("Inference is done.")
def main():
    """Entry point for the GLUE BERT example: parse CLI flags, set up the
    device/logging/seed, load model+tokenizer, optionally train, then per
    checkpoint run one of: plain eval, LPOT auto-tune, benchmark/accuracy
    with an optionally reloaded int8 model, manual calibration, or manual
    int8 inference. Returns the accumulated ``results`` dict.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list;"
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train",
                        action='store_true',
                        help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16",
                        action='store_true',
                        help="run bf16 evaluation / training.")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--warmup",
                        type=int,
                        default=2,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        default='conf.yaml',
                        type=str,
                        help='yaml config file path')
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='run benchmark')

    args = parser.parse_args()

    # Refuse to clobber an existing, non-empty output dir during training.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # mix_qkv is needed by every quantization-related path.
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            logger.info("Evaluate:" + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v)
                              for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_lpot(model):
                    """Return the first matching accuracy-style metric from
                    evaluate(); LPOT uses it as the tuning objective."""
                    result, perf = evaluate(args,
                                            model,
                                            tokenizer,
                                            prefix=prefix)
                    bert_task_acc_keys = [
                        'acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                # MNLI evaluates both matched and mismatched splits.
                eval_task_names = (
                    "mnli", "mnli-mm") if args.task_name == "mnli" else (
                        args.task_name, )
                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args,
                                                           eval_task,
                                                           tokenizer,
                                                           evaluate=True)

                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                        1, args.n_gpu)
                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)

                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)
                    from lpot import Quantization, common
                    quantizer = Quantization(args.config)
                    # Non-SQuAD GLUE tasks share the generic classifier spec.
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = quantizer.dataset(
                        'bert',
                        dataset=eval_dataset,
                        task=eval_task,
                        model_type=args.model_type)
                    quantizer.model = common.Model(model)
                    quantizer.calib_dataloader = common.DataLoader(
                        eval_dataset, batch_size=args.eval_batch_size)
                    quantizer.eval_func = eval_func_for_lpot
                    q_model = quantizer()
                    q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    # Reload int8 weights/config produced by a prior tune run.
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                # Known accuracy-sensitive layer kept in fp32 for this combo.
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                # Calibration pass: observers collect activation ranges.
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step,
                                     calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                # int8 inference requires the weights saved by do_calibration.
                if not os.path.exists(quantized_model_path):
                    logger.error(
                        "please do calibrantion befor run int8 inference")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
    return results
def main():
    """Entry point for the PeleeNet example: build the model (optionally
    restoring a checkpoint or pretrained weights), then evaluate, LPOT-tune,
    benchmark, or train, depending on the parsed command-line flags.
    """
    global args, best_acc1
    args = parser.parse_args()
    print('args:', args)

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # Val data loading
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(args.input_dim + 32),
            transforms.CenterCrop(args.input_dim),
            transforms.ToTensor(),
            normalize,
        ]))

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # Class count is inferred from the validation ImageFolder layout.
    num_classes = len(val_dataset.classes)
    print('Total classes: ', num_classes)

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'peleenet':
        model = PeleeNet(num_classes=num_classes)
    else:
        # Any other arch name falls back to PeleeNet with a warning.
        print(
            "=> unsupported model '{}'. creating PeleeNet by default.".format(
                args.arch))
        model = PeleeNet(num_classes=num_classes)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide
        model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    elif args.pretrained:
        if os.path.isfile(args.weights):
            checkpoint = torch.load(args.weights,
                                    map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {}, acc@1 {})".format(
                args.pretrained, checkpoint['epoch'],
                checkpoint['best_acc1']))
        else:
            # NOTE(review): message references args.resume although this
            # branch checked args.weights — likely copy-paste; confirm.
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.tune:
        # model.module: fuse on the wrapped module inside (Distributed)DataParallel.
        model.eval()
        model.module.fuse_model()
        from lpot import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            from lpot.utils.pytorch import load
            # Reload int8 weights/config produced by a prior tune run.
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        exit(0)

    # Training data loading
    traindir = os.path.join(args.data, 'train')
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.input_dim),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best Acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    """Train or evaluate/quantize/benchmark a BlendCNN model on a GLUE-style task.

    config: path to the top-level JSON config; it points at the data,
        model and optimizer sub-configs.
    args: CLI namespace providing dataset_location, batch_size, warmup,
        tune/int8/accuracy_only/benchmark flags, etc. (schema defined by
        the caller -- not visible here).
    """
    cfg = Config(**json.load(open(config, "r")))

    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    # Text -> token ids pipeline: strip symbols, tokenize, add special
    # tokens with truncation, then index into the vocabulary.
    dataset = TaskDataset(
        args.dataset_location,
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        """Compute cross-entropy loss for one (ids, segments, mask, label) batch."""
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        """Return (batch accuracy, per-sample 0/1 correctness tensor)."""
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    class Bert_DataLoader(object):
        """Adapter that re-yields batches as ((ids, segments, mask), labels).

        This is the iterable shape LPOT's calibration dataloader expects.
        """

        def __init__(self, loader=None, model_type=None, device='cpu',
                     batch_size=1):
            self.loader = loader          # underlying torch DataLoader
            self.model_type = model_type  # unused here; kept for API parity
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {
                    'output_all': (batch[0], batch[1], batch[2]),
                    'labels': batch[3]
                }
                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        """Measure latency/throughput on synthetic data (no accuracy)."""
        total_samples = 0
        total_time = 0
        index = 0

        class RandomDataset(object):
            """Synthetic BERT-style inputs: random ids, all-zero segments/mask.

            NOTE(review): torch.randint(low=0, high=1, ...) yields only
            zeros -- presumably intentional for a pure-speed benchmark.
            """

            def __init__(self, size, shape):
                self.len = size
                # 30522 matches the standard BERT vocab size -- TODO confirm.
                self.input_ids = torch.randint(low=0, high=30522,
                                               size=(size, shape),
                                               dtype=torch.int64)
                self.segment_ids = torch.randint(low=0, high=1,
                                                 size=(size, shape),
                                                 dtype=torch.int64)
                self.input_mask = torch.randint(low=0, high=1,
                                                size=(size, shape),
                                                dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids,
                             self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index],
                        self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size, shuffle=True)
        for batch in rand_loader:
            index += 1
            tic = time.time()
            # Optional torch profiler, enabled via environment variable.
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad(
                ):  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            # Skip the first args.warmup batches when accumulating timing.
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f images/sec' % (throughput))
        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))

    def eval_func(model):
        """Evaluate accuracy on data_iter; also reports latency/throughput.

        Returns the overall accuracy as a float (this is the callable LPOT
        uses as quantizer.eval_func).
        """
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter,
                                          batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad(
                ):  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            # Timing excludes the first args.warmup batches.
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))
        print('Accuracy: %.3f ' % (total_accuracy))
        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, None)  # not use pretrain_file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")
        if args.tune:
            import lpot
            from lpot import common
            # lpot tune: load FP32 weights, then quantize with eval_func
            # as the accuracy criterion and save the tuned model.
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter,
                                              batch_size=args.batch_size)

            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)
        elif args.int8:
            # Reload the previously tuned INT8 model for accuracy or
            # benchmark runs.
            from lpot.utils.pytorch import load
            int8_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)
        else:
            # Plain FP32 path.
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
def main():
    """Evaluate (and optionally LPOT-quantize) a ResNeSt model on ImageNet.

    Flow: build the val loader, instantiate the requested resnest.torch
    model, optionally load a checkpoint, fuse modules, then either tune
    with LPOT (--tune), or run a timed top-1/top-5 validation pass on the
    FP32 or previously tuned INT8 model.
    """
    # init the args
    args = Options().parse()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # init dataloader
    # NOTE(review): interp and base_size are computed but not used below
    # in the visible code -- presumably consumed elsewhere or leftover.
    interp = PIL.Image.BILINEAR if args.crop_size < 320 else PIL.Image.BICUBIC
    base_size = args.base_size if args.base_size is not None else int(
        1.0 * args.crop_size / 0.875)
    transform_val = transforms.Compose([
        ECenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    valset = ImageNetDataset(args.data, transform=transform_val, train=False)
    val_loader = torch.utils.data.DataLoader(
        valset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True if args.cuda else False)

    # assert args.model in torch.hub.list('zhanghang1989/ResNeSt', force_reload=True)
    # Validate the requested model name against functions exported by
    # `module` (defined outside this function -- presumably the imported
    # resnest.torch module; verify against the file header).
    functions = inspect.getmembers(module, inspect.isfunction)
    model_list = [f[0] for f in functions]
    assert args.model in model_list
    get_model = importlib.import_module('resnest.torch')
    net = getattr(get_model, args.model)
    # model = torch.hub.load('zhanghang1989/ResNeSt', args.model, pretrained=True)
    model = net(pretrained=True)
    # print(model)

    if args.cuda:
        model.cuda()
        # Please use CUDA_VISIBLE_DEVICES to control the number of gpus
        model = nn.DataParallel(model)

    # checkpoint: --verify loads a raw state_dict, --resume loads a
    # training checkpoint dict with a 'state_dict' key.
    if args.verify:
        if os.path.isfile(args.verify):
            print("=> loading checkpoint '{}'".format(args.verify))
            model.module.load_state_dict(torch.load(args.verify))
        else:
            raise RuntimeError("=> no verify checkpoint found at '{}'".format(
                args.verify))
    elif args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.module.load_state_dict(checkpoint['state_dict'])
        else:
            raise RuntimeError("=> no resume checkpoint found at '{}'".format(
                args.resume))

    model.eval()
    model.fuse_model()

    if args.tune:
        # LPOT quantization path: tune against ./conf.yaml, save, exit.
        from lpot import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

    if args.int8:
        # Reload the LPOT-tuned INT8 weights/config on top of the model.
        from lpot.utils.pytorch import load
        new_model = load(
            os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
    else:
        new_model = model

    top1 = AverageMeter()
    top5 = AverageMeter()
    batch_time = AverageMeter()
    # iterations == 0 means "run the whole loader"; otherwise stop after
    # warmup + iterations batches. Timing skips the warmup batches.
    iterations = args.iterations
    warmup = args.warmup_iterations
    tbar = tqdm(val_loader, desc='\r')
    for batch_idx, (data, target) in enumerate(tbar):
        if iterations == 0 or batch_idx < iterations + warmup:
            if batch_idx >= warmup:
                end = time.time()
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            with torch.no_grad():
                output = new_model(data)
                if batch_idx >= warmup:
                    batch_time.update(time.time() - end)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                top1.update(acc1[0], data.size(0))
                top5.update(acc5[0], data.size(0))

            tbar.set_description('Top1: %.3f | Top5: %.3f' %
                                 (top1.avg, top5.avg))
        elif batch_idx == iterations + warmup:
            break

    print('Batch size = %d' % args.batch_size)
    if args.batch_size == 1:
        print('Latency: %.3f ms' % (batch_time.avg * 1000))
    print('Throughput: %.3f images/sec' % (args.batch_size / batch_time.avg))
    print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}'.format(
        top1=(top1.avg / 100), top5=(top5.avg / 100)))
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: build model/data, then evaluate, QAT-tune,
    benchmark, or train a torchvision quantizable model.

    gpu: device index for this worker (used as rank offset when
        multiprocessing-distributed).
    ngpus_per_node: number of workers per node (for rank/batch scaling).
    args: parsed CLI namespace.
    """
    global best_acc1
    print("Use CPU: {} for training".format(gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model -- quantize=False loads the FP32 variant of the
    # quantizable torchvision model so we can QAT-tune it ourselves.
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True, quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            # NOTE(review): DistributedDataParallelCPU is a legacy API --
            # modern PyTorch uses DistributedDataParallel with a gloo
            # backend instead; kept as-is.
            model = torch.nn.parallel.DistributedDataParallelCPU(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.tune:

        def training_func_for_lpot(model):
            """QAT fine-tuning loop handed to LPOT as q_func.

            Runs a few short epochs (30 iterations each); later epochs
            freeze the observers and then the BN statistics, following the
            standard PyTorch QAT recipe.
            """
            epochs = 8
            iters = 30
            optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
            for nepoch in range(epochs):
                model.train()
                cnt = 0
                for image, target in train_loader:
                    print('.', end='')
                    cnt += 1
                    output = model(image)
                    loss = criterion(output, target)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if cnt >= iters:
                        break
                if nepoch > 3:
                    # Freeze quantizer parameters
                    model.apply(torch.quantization.disable_observer)
                if nepoch > 2:
                    # Freeze batch norm mean and variance estimates
                    model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)
            return

        # Fuse conv/bn/relu before handing the model to LPOT for QAT.
        model.module.fuse_model()
        from lpot import Quantization, common
        quantizer = Quantization(args.config)
        quantizer.model = common.Model(model)
        quantizer.q_func = training_func_for_lpot
        quantizer.eval_dataloader = val_loader
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            # Reload the LPOT-tuned INT8 weights/config on the fused model.
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Only rank-0 (per node) writes checkpoints in the
        # multiprocessing-distributed setting.
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
path_imgrec=dataset, label_width=1, preprocess_threads=data_nthreads, batch_size=batch_size, data_shape=data_shape, label_name=label_name, rand_crop=False, rand_mirror=False, shuffle=args.shuffle_dataset, shuffle_chunk_seed=args.shuffle_chunk_seed, seed=args.shuffle_seed, dtype=data_layer_type, ctx=args.ctx, **combine_mean_std) quantizer = Quantization("./cnn.yaml") quantizer.model = common.Model(fp32_model) quantizer.calib_dataloader = calib_data quantizer.eval_dataloader = data q_model = quantizer() q_model.save(args.output_graph) sys.exit() if args.accuracy_only: symbol_file = args.symbol_file param_file = args.param_file sym, arg_params, aux_params = load_model(symbol_file, param_file) score(sym, arg_params, aux_params, data, [ctx], label_name,