    def test_quantization_saved(self):
        from lpot.utils.pytorch import load

        model = copy.deepcopy(self.model)

        for fake_yaml in ['qat_yaml.yaml', 'ptq_yaml.yaml']:
            if fake_yaml == 'ptq_yaml.yaml':
                model.eval().fuse_model()
            quantizer = Quantization(fake_yaml)
            dataset = quantizer.dataset('dummy', (100, 3, 256, 256),
                                        label=True)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = common.DataLoader(dataset)
            quantizer.eval_dataloader = common.DataLoader(dataset)
            if fake_yaml == 'qat_yaml.yaml':
                quantizer.q_func = q_func
            q_model = quantizer()
            q_model.save('./saved')
            # Load the configuration and weights via lpot.utils
            saved_model = load("./saved", model)
            eval_func(saved_model)
        from lpot import Benchmark
        evaluator = Benchmark('ptq_yaml.yaml')
        # Load the configuration and weights via lpot.model
        evaluator.model = common.Model(model)
        evaluator.b_dataloader = common.DataLoader(dataset)
        results = evaluator()
        evaluator.model = common.Model(model)
        fp32_results = evaluator()
        self.assertTrue(
            (fp32_results['accuracy'][0] - results['accuracy'][0]) < 0.01)
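Note: these test snippets call helpers such as eval_func and q_func that are defined elsewhere in the original test module. A minimal sketch of what they might look like follows; the bodies are illustrative assumptions, not the original definitions.

import torch

def eval_func(model):
    # Illustrative stand-in: run one dummy batch and return a dummy accuracy.
    model.eval()
    with torch.no_grad():
        model(torch.randn(1, 3, 256, 256))
    return 0.0

def q_func(model):
    # Illustrative QAT hook: a single forward/backward pass on dummy data.
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    loss = model(torch.randn(1, 3, 256, 256)).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return model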
    def test_tuning_ipex(self):
        from lpot import Quantization
        model = torchvision.models.resnet18()
        model = MODELS['pytorch_ipex'](model)
        quantizer = Quantization('ipex_yaml.yaml')
        dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(dataset)
        quantizer.eval_dataloader = common.DataLoader(dataset)
        lpot_model = quantizer()
        lpot_model.save("./saved")
        new_model = MODELS['pytorch_ipex'](model.model, {
            "workspace_path": "./saved"
        })
        new_model.model.to(ipex.DEVICE)
        try:
            script_model = torch.jit.script(new_model.model)
        except Exception:
            script_model = torch.jit.trace(
                new_model.model,
                torch.randn(10, 3, 224, 224).to(ipex.DEVICE))
        from lpot import Benchmark
        evaluator = Benchmark('ipex_yaml.yaml')
        evaluator.model = common.Model(script_model)
        evaluator.b_dataloader = common.DataLoader(dataset)
        results = evaluator()
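The script-then-trace fallback above reappears in later examples; factored into a helper, it might look like this (the name and example input shape are illustrative assumptions):

def to_jit(model, example_shape=(1, 3, 224, 224)):
    # Prefer scripting; fall back to tracing with a dummy input if scripting fails.
    try:
        return torch.jit.script(model)
    except Exception:
        return torch.jit.trace(model, torch.randn(*example_shape))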
Example #3
    def test_mlp_model_quantization(self):
        """
        Use the MLP model to test min-max calibration and the built-in evaluation function.
        """
        for shape in [(500, 1000),]:
            arg_shapes, label_shape, _ = self.mlp_model.infer_shape(data=shape)

            mod = mx.mod.Module(symbol=self.mlp_model, context=mx.current_context())
            mod.bind(for_training=False,
                     data_shapes=[('data', arg_shapes[0])],
                     label_shapes=[('softmax_label', label_shape[0])])
            mod.init_params()

            arg_params, aux_params = mod.get_params()
            data = mx.nd.random.uniform(low=self.data_low, high=self.data_high,
                                        shape=shape).astype('float32')
            labels = mx.nd.ones([shape[0], ])
            calib_data = mx.io.NDArrayIter(data=data, label=labels, batch_size=shape[0])

            fp32_model = (self.mlp_model, arg_params, aux_params)
            self.quantizer_1.model = common.Model(fp32_model)
            self.quantizer_1.calib_dataloader = calib_data
            self.quantizer_1.eval_dataloader = calib_data
            qmodel = self.quantizer_1()
            self.assertIsInstance(qmodel.model[0], mx.symbol.Symbol)
Example #4
    def test_gluon_model(self):
        """
        Use a Gluon model to test Gluon-related functions in the MXNet adaptor.
        """
        # create gluon model
        net = nn.HybridSequential()
        net.add(nn.Dense(128, activation="relu"))
        net.add(nn.Dense(64, activation="relu"))
        net.add(nn.Dense(10))
        net.initialize()

        class Quant_dataloader():
            def __init__(self, dataset, batch_size=1):
                self.dataset = dataset
                self.batch_size = batch_size

            def __iter__(self):
                for data, label in self.dataset:
                    yield data, label

            def __getitem__(self, index):
                pass

        valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False)
        q_dataloader = Quant_dataloader(valid_dataset)
        self.quantizer_1.model = common.Model(net)
        self.quantizer_1.calib_dataloader = q_dataloader
        self.quantizer_1.eval_func = eval_func
        qmodel = self.quantizer_1()
        self.assertIsInstance(qmodel.model, mx.gluon.HybridBlock)
Example #5
def main():

    import lpot
    from lpot import common
    quantizer = lpot.Quantization('./conf.yaml')
    quantizer.model = common.Model("./mobilenet_v1_1.0_224_frozen.pb")
    quantized_model = quantizer()
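As in the other examples, the returned model could then be persisted with save(); a hypothetical follow-up line (the output path is a placeholder):

    quantized_model.save('./mobilenet_v1_int8.pb')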
Example #6
def main():
    class CalibrationDL():
        def __init__(self):
            path = os.path.abspath(
                os.path.expanduser('./brats_cal_images_list.txt'))
            with open(path, 'r') as f:
                self.preprocess_files = [line.rstrip() for line in f]

            self.loaded_files = {}
            self.batch_size = 1

        def __getitem__(self, sample_id):
            file_name = self.preprocess_files[sample_id]
            print("Loading file {:}".format(file_name))
            with open(
                    os.path.join('build/calib_preprocess/',
                                 "{:}.pkl".format(file_name)), "rb") as f:
                self.loaded_files[sample_id] = pickle.load(f)[0]
            return torch.from_numpy(
                self.loaded_files[sample_id][np.newaxis, ...]).float(), None

        def __len__(self):
            self.count = len(self.preprocess_files)
            return self.count

    args = get_args()
    assert args.backend == "pytorch"
    model_path = os.path.join(args.model_dir, "plans.pkl")
    assert os.path.isfile(
        model_path), "Cannot find the model file {:}!".format(model_path)
    trainer, params = load_model_and_checkpoint_files(
        args.model_dir,
        folds=1,
        fp16=False,
        checkpoint_name='model_final_checkpoint')
    trainer.load_checkpoint_ram(params[0], False)
    model = trainer.network

    if args.tune:
        quantizer = Quantization('conf.yaml')
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        calib_dl = CalibrationDL()
        quantizer.calib_dataloader = calib_dl
        q_model = quantizer()
        q_model.save('./lpot_workspace')
        exit(0)

    if args.benchmark:
        model.eval()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser('./lpot_workspace')), model)
        else:
            new_model = model
        eval_func(new_model)
Example #7
def main():

    import lpot
    quantizer = lpot.Quantization('./conf.yaml')
    dataset = quantizer.dataset('dummy', shape=(100, 100, 100, 3), label=True)
    quantizer.model = common.Model(
        './model/public/rfcn-resnet101-coco-tf/model/public/rfcn-resnet101-coco-tf/rfcn_resnet101_coco_2018_01_28/'
    )
    quantizer.calib_dataloader = common.DataLoader(dataset)
    quantized_model = quantizer()
Example #8
def main():

    import lpot
    from lpot import common
    quantizer = lpot.Quantization('./conf.yaml')
    quantizer.model = common.Model("./mobilenet_v1_1.0_224_frozen.pb")
    quantized_model = quantizer()

    # Optional, run benchmark
    from lpot import Benchmark
    evaluator = Benchmark('./conf.yaml')
    evaluator.model = common.Model(quantized_model)
    results = evaluator()
    batch_size = 1
    for mode, result in results.items():
        acc, batch_size, result_list = result
        latency = np.array(result_list).mean() / batch_size

        print('Accuracy is {:.3f}'.format(acc))
        print('Latency: {:.3f} ms'.format(latency * 1000))
Example #9
    def test_tensor_dump(self):
        model = copy.deepcopy(self.lpot_model)
        model.model.eval().fuse_model()
        quantizer = Quantization('dump_yaml.yaml')
        dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
        quantizer.model = common.Model(model.model)
        quantizer.calib_dataloader = common.DataLoader(dataset)
        quantizer.eval_func = eval_func
        quantizer()
        self.assertTrue(os.path.exists('runs/eval/baseline_acc0.0'))
        quantizer.eval_dataloader = common.DataLoader(dataset)
        quantizer()
        self.assertTrue(os.path.exists('runs/eval/baseline_acc0.0'))
Example #10
    def test_quantizate(self):
        from lpot import Quantization, common
        for fake_yaml in ["static_yaml.yaml", "dynamic_yaml.yaml"]:
            quantizer = Quantization(fake_yaml)
            dataset = quantizer.dataset("dummy", (100, 3, 224, 224),
                                        low=0.,
                                        high=1.,
                                        label=True)
            quantizer.calib_dataloader = common.DataLoader(dataset)
            quantizer.eval_dataloader = common.DataLoader(dataset)
            quantizer.model = common.Model(self.rn50_model)
            q_model = quantizer()
            eval_func(q_model)
        for fake_yaml in ["non_MSE_yaml.yaml"]:
            quantizer = Quantization(fake_yaml)
            dataset = quantizer.dataset("dummy", (100, 3, 224, 224),
                                        low=0.,
                                        high=1.,
                                        label=True)
            quantizer.calib_dataloader = common.DataLoader(dataset)
            quantizer.eval_dataloader = common.DataLoader(dataset)
            quantizer.model = common.Model(self.mb_v2_model)
            q_model = quantizer()
            eval_func(q_model)
Example #11
    def run(self):
        """LPOT entry point covering both the tuning and benchmark options."""

        if self.args.tune:
            from lpot import Quantization, common
            quantizer = Quantization(self.args.config)
            quantizer.model = common.Model(self.args.input_graph)
            q_model = quantizer()
            q_model.save(self.args.output_graph)

        if self.args.benchmark:
            from lpot import Benchmark, common
            evaluator = Benchmark(self.args.config)
            evaluator.model = common.Model(self.args.input_graph)
            results = evaluator()
            for mode, result in results.items():
                acc, batch_size, result_list = result
                latency = np.array(result_list).mean() / batch_size

                print('\n{} mode benchmark result:'.format(mode))
                print('Accuracy is {:.3f}'.format(acc))
                print('Batch size = {}'.format(batch_size))
                print('Latency: {:.3f} ms'.format(latency * 1000))
                print('Throughput: {:.3f} images/sec'.format(1. / latency))
Example #12
def main():

    import lpot
    from lpot import common
    quantizer = lpot.Quantization('./conf.yaml')

    # Get graph from slim checkpoint
    from tf_slim.nets import inception
    model_func = inception.inception_v1
    arg_scope = inception.inception_v1_arg_scope()
    kwargs = {'num_classes': 1001}
    inputs_shape = [None, 224, 224, 3]
    images = tf.compat.v1.placeholder(name='input', dtype=tf.float32,
                                      shape=inputs_shape)

    # Do quantization
    quantizer.model = common.Model('./inception_v1.ckpt')
    quantized_model = quantizer()
Example #13
def tune_model(
    input_graph: str,
    output_graph: str,
    config: str,
    framework: str,
) -> None:
    """Execute tuning."""
    from lpot import Quantization, common

    if framework == "onnxrt":
        import onnx

        input_graph = onnx.load(input_graph)

    quantizer = Quantization(config)
    quantizer.model = common.Model(input_graph)
    quantized_model = quantizer()
    quantized_model.save(output_graph)
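A hypothetical invocation of tune_model, with placeholder paths:

tune_model(
    input_graph='./model.onnx',
    output_graph='./model_int8.onnx',
    config='./conf.yaml',
    framework='onnxrt',
)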
Example #14
def main(_):
    graph = load_graph(FLAGS.input_graph)
    if FLAGS.mode == 'tune':
        from lpot import Quantization, common
        quantizer = Quantization(FLAGS.config)
        ds = Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file)
        quantizer.calib_dataloader = common.DataLoader(ds, collate_fn=collate_fn,
                                                       batch_size=FLAGS.batch_size)
        quantizer.model = common.Model(graph)
        quantizer.eval_func = eval_func
        q_model = quantizer()
        try:
            q_model.save(FLAGS.output_model)
        except Exception as e:
            print("Failed to save model due to {}".format(str(e)))
    elif FLAGS.mode == 'benchmark':
        eval_func(graph, FLAGS.iters)
    elif FLAGS.mode == 'accuracy':
        eval_func(graph, -1)
Example #15
def benchmark_model(
    input_graph: str,
    config: str,
    benchmark_mode: str,
    framework: str,
    datatype: str = "",
) -> List[Dict[str, Any]]:
    """Execute benchmark."""
    from lpot import Benchmark, common

    benchmark_results = []

    if framework == "onnxrt":
        import onnx

        input_graph = onnx.load(input_graph)

    evaluator = Benchmark(config)
    evaluator.model = common.Model(input_graph)
    results = evaluator()
    for mode, result in results.items():
        if benchmark_mode == mode:
            log.info(f"Mode: {mode}")
            acc, batch_size, result_list = result
            latency = (sum(result_list) / len(result_list)) / batch_size
            log.info(f"Batch size: {batch_size}")
            if mode == "accuracy":
                log.info(f"Accuracy: {acc:.3f}")
            elif mode == "performance":
                log.info(f"Latency: {latency * 1000:.3f} ms")
                log.info(f"Throughput: {1. / latency:.3f} images/sec")

            benchmark_results.append(
                {
                    "precision": datatype,
                    "mode": mode,
                    "batch_size": batch_size,
                    "accuracy": acc,
                    "latency": latency * 1000,
                    "throughput": 1.0 / latency,
                },
            )
    return benchmark_results
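And a hypothetical call to benchmark_model, again with placeholder arguments:

results = benchmark_model(
    input_graph='./model.pb',
    config='./conf.yaml',
    benchmark_mode='performance',
    framework='tensorflow',
    datatype='int8',
)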
Example #16
    def test_conv_model_quantization(self):
        """
        Use the Conv model to test KL calibration and a user-specified evaluation function.
        """
        for shape in [(500, 3, 224, 224),]:
            arg_shapes, _, _ = self.conv_model.infer_shape(data=shape)

            mod = mx.mod.Module(symbol=self.conv_model, context=mx.current_context())
            mod.bind(for_training=False,
                     data_shapes=[('data', arg_shapes[0])])
            mod.init_params()

            arg_params, aux_params = mod.get_params()
            data = mx.nd.random.uniform(low=self.data_low, high=self.data_high,
                                        shape=shape).astype('float32')
            calib_data = mx.io.NDArrayIter(data=data, batch_size=shape[0])

            fp32_model = (self.conv_model, arg_params, aux_params)
            self.quantizer_2.model = common.Model(fp32_model)
            self.quantizer_2.calib_dataloader = calib_data
            self.quantizer_2.eval_dataloader = calib_data
            self.quantizer_2.eval_func = eval_func
            qmodel = self.quantizer_2()
            # test inspected_tensor
            inspect_tensor = self.quantizer_2.strategy.adaptor.inspect_tensor
            self.quantizer_2.model = fp32_model
            inspected_tensor = inspect_tensor(self.quantizer_2.model, calib_data,
                                              op_list=[('sg_mkldnn_conv_bn_act_0_output', 'CONV'),
                                                       ('data', 'input')],
                                              iteration_list=[0, 2, 4])
            inspected_qtensor = inspect_tensor(qmodel, calib_data,
                                               op_list=[('quantized_sg_mkldnn_conv_bn_act_0_output', 'CONV')],
                                               iteration_list=[0])

            self.assertNotEqual(len(inspected_tensor), 0)
            self.assertNotEqual(len(inspected_qtensor), 0)
            self.assertIsInstance(qmodel.model[0], mx.symbol.Symbol)
Example #17
    def test_pruning(self):
        from lpot import Pruning, common
        prune = Pruning('fake.yaml')

        dummy_dataset = PyTorchDummyDataset([tuple([100, 3, 256, 256])])
        dummy_dataloader = PyTorchDataLoader(dummy_dataset)

        def training_func_for_lpot(model):
            epochs = 16
            iters = 30
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
            for nepoch in range(epochs):
                model.train()
                cnt = 0
                prune.on_epoch_begin(nepoch)
                for image, target in dummy_dataloader:
                    prune.on_batch_begin(cnt)
                    print('.', end='')
                    cnt += 1
                    output = model(image)
                    loss = criterion(output, target)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    prune.on_batch_end()
                    if cnt >= iters:
                        break
                prune.on_epoch_end()

        dummy_dataset = PyTorchDummyDataset(tuple([100, 3, 256, 256]),
                                            label=True)
        dummy_dataloader = PyTorchDataLoader(dummy_dataset)
        prune.model = common.Model(self.model)
        prune.q_func = training_func_for_lpot
        prune.eval_dataloader = dummy_dataloader
        _ = prune()
Example #18
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="SQuAD task")
    parser.add_argument("--warmup",
                        type=int,
                        default=5,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        type=str,
                        default='conf.yaml',
                        help="yaml config file")
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by the Low Precision Optimization Tool (default: ./saved_results)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='run benchmark')

    args = parser.parse_args()

    args.predict_file = os.path.join(
        args.output_dir, 'predictions_{}_{}.txt'.format(
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir,
                                            force_download=True,
                                            mix_qkv=mix_qkv)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            if args.mkldnn_eval or args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True)
                model.to(args.device)

                # Evaluate
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_lpot(model):
                    result, _ = evaluate(args, model, tokenizer)
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                    bert_task_acc_keys = [
                        'best_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                dataset = load_and_cache_examples(args,
                                                  tokenizer,
                                                  evaluate=True,
                                                  output_examples=False)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                    1, args.n_gpu)
                eval_task = "squad"
                from lpot import Quantization, common
                quantizer = Quantization(args.config)
                dataset = quantizer.dataset('bert',
                                            dataset=dataset,
                                            task=eval_task,
                                            model_type=args.model_type)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = common.DataLoader(
                    dataset, batch_size=args.eval_batch_size)
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args,
                                     new_model,
                                     tokenizer,
                                     prefix=global_step)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                # Evaluate
                evaluate(args,
                         model,
                         tokenizer,
                         prefix=global_step,
                         calibration=True)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.info("Please run calibration first!")
                    return
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                print(model)
                with torch.autograd.profiler.profile() as prof:
                    result, _ = evaluate(args,
                                         model,
                                         tokenizer,
                                         prefix=global_step)
                print(prof.key_averages().table(sort_by="cpu_time_total"))
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)
    logger.info("Results: {}".format(results))

    return results
Example #19
    parser.add_argument(
        '--tune',
        action='store_true',
        default=False,
        help="whether to quantize the model"
    )
    parser.add_argument('--config', type=str, help="config yaml path")
    parser.add_argument('--output_model', type=str, help="output model path")

    args = parser.parse_args()

    model = onnx.load(args.model_path)
    if args.benchmark:
        from lpot import Benchmark, common
        evaluator = Benchmark(args.config)
        evaluator.model = common.Model(model)
        results = evaluator()
        for mode, result in results.items():
            acc, batch_size, result_list = result
            latency = np.array(result_list).mean() / batch_size

            print('\n{} mode benchmark result:'.format(mode))
            print('Accuracy is {:.3f}'.format(acc))
            print('Batch size = {}'.format(batch_size))
            print('Latency: {:.3f} ms'.format(latency * 1000))
            print('Throughput: {:.3f} images/sec'.format(batch_size * 1. /
                                                         latency))

    if args.tune:
        from lpot import Quantization, common
Example #20
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    #args.gpu = gpu
    #affinity = subprocess.check_output("lscpu | grep 'NUMA node[0-9]' | awk '{ print $4 }' | awk -F',' '{ print $1 }'", shell=True)
    #os.environ['OMP_NUM_THREADS'] = '28'
    #os.environ['KMP_AFFINITY'] = 'proclist=[{}],granularity=thread,explicit'.format(affinity.splitlines()[gpu].decode('utf-8'))
    #print (os.environ['KMP_AFFINITY'])

    #if args.gpu is not None:
    #    print("Use GPU: {} for training".format(args.gpu))
    print("Use CPU: {} for training".format(gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        if args.ipex:
            model = models.__dict__[args.arch](pretrained=True)
        else:
            model = quantize_models.__dict__[args.arch](pretrained=True, quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        if args.ipex:
            model = models.__dict__[args.arch]()
        else:
            model = quantize_models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU...')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            #model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallelCPU(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    #criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    #cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)

    if args.tune:
        from lpot import Quantization, common
        if args.ipex:
            quantizer = Quantization("./conf_ipex.yaml")
        else:
            model.eval()
            model.fuse_model()
            quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

    if args.benchmark or args.accuracy_only:
        model.eval()
        ipex_config_path = None
        if args.int8:
            if args.ipex:
                # TODO: Remove this when IPEX supports saving script models.
                model.to(ipex.DEVICE)
                try:
                    new_model = torch.jit.script(model)
                except Exception:
                    new_model = torch.jit.trace(model, torch.randn(1, 3, 224, 224).to(ipex.DEVICE))
                ipex_config_path = os.path.join(os.path.expanduser(args.tuned_checkpoint),
                                                "best_configure.json")
            else:
                model.fuse_model()
                from lpot.utils.pytorch import load
                new_model = load(
                    os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
        else:
            if args.ipex:
                # TODO: Remove this when IPEX supports saving script models.
                model.to(ipex.DEVICE)
                try:
                    new_model = torch.jit.script(model)
                except Exception:
                    new_model = torch.jit.trace(model, torch.randn(1, 3, 224, 224).to(ipex.DEVICE))
            else:
                model.fuse_model()
                new_model = model
        validate(val_loader, new_model, criterion, args, ipex_config_path)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
Example #21
def main_worker(gpu, args):
    global best_acc1
    print("Use CPU: {} for training".format(gpu))

    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True, quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.prune:
        from lpot import Pruning, common
        prune = Pruning(args.config)

        def training_func_for_lpot(model):
            epochs = 16
            iters = 30
            optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
            for nepoch in range(epochs):
                model.train()
                cnt = 0
                prune.on_epoch_begin(nepoch)
                for image, target in train_loader:
                    prune.on_batch_begin(cnt)
                    print('.', end='')
                    cnt += 1
                    output = model(image)
                    loss = criterion(output, target)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    prune.on_batch_end()
                    if cnt >= iters:
                        break
                prune.on_epoch_end()
                if nepoch > 3:
                    # Freeze quantizer parameters
                    model.apply(torch.quantization.disable_observer)
                if nepoch > 2:
                    # Freeze batch norm mean and variance estimates
                    model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)
            validate(val_loader, model, criterion, args)

            return

        prune.model = common.Model(model)
        prune.eval_dataloader = val_loader
        prune.q_func = training_func_for_lpot
        q_model = prune()
        return
Example #22
    EM_acc = evaluate(model)
    return EM_acc

if __name__ == '__main__':
    if only_calibration:
        try:
            calibration(net,
                        num_calib_batches,
                        quantized_dtype,
                        calib_mode)
        except AttributeError:
            nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
            warnings.warn('INT8 quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
    elif not only_predict:
        train()
        evaluate()
    elif args.tune:
        # lpot auto-tuning
        dev_dataloader = gen_dataset()
        from lpot import Quantization, common
        quantizer = Quantization("./bert.yaml")
        quantizer.model = common.Model(net)
        quantizer.calib_dataloader = dev_dataloader
        quantizer.eval_dataloader = dev_dataloader
        quantizer.eval_func = eval_func
        q_model = quantizer()
        q_model.save(args.output_dir)
    elif model_parameters or deploy:
        evaluate()

Example #23
    logger.info("speed is %.2f samples/s" % speed)

    return acc


if __name__ == '__main__':
    if only_calibration:
        try:
            calibration(model, dev_data_list, num_calib_batches,
                        quantized_dtype, calib_mode)
        except AttributeError:
            nlp.utils.version.check_version('1.7.0',
                                            warning_only=True,
                                            library=mx)
            warnings.warn(
                'INT8 quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
    elif args.tune:
        # lpot auto-tuning
        if only_inference:
            calib_data = dev_data_list[0][1]
            from lpot import Quantization, common
            quantizer = Quantization("./bert.yaml")
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = calib_data
            quantizer.eval_dataloader = calib_data
            quantizer.eval_func = test_func
            q_model = quantizer()
            q_model.save(args.output_dir)
    else:
        train(task.metrics)
Example #24
        pass

    def reset(self):
        self.pred_list = []
        self.label_list = []
        self.samples = 0
        pass

    def result(self):
        correct_num = np.sum(
            np.array(self.pred_list) == np.array(self.label_list))
        return correct_num / self.samples


# Quantize with customized dataloader and metric
quantizer = lpot.Quantization('./conf.yaml')
dataset = Dataset()
quantizer.metric = common.Metric(MyMetric, 'hello_metric')
quantizer.calib_dataloader = common.DataLoader(dataset, batch_size=1)
quantizer.eval_dataloader = common.DataLoader(dataset, batch_size=1)
quantizer.model = common.Model('../models/simple_model')
q_model = quantizer()

# Optional, run quantized model
import tensorflow as tf
with tf.compat.v1.Graph().as_default(), tf.compat.v1.Session() as sess:
    tf.compat.v1.import_graph_def(q_model.as_graph_def(), name='')
    styled_image = sess.run(['output:0'],
                            feed_dict={'input:0': dataset.test_images})
    print("Inference is done.")
Example #25
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list;"
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train",
                        action='store_true',
                        help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16",
                        action='store_true',
                        help="run bf16 evaluation / training.")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--warmup",
                        type=int,
                        default=2,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        default='conf.yaml',
                        type=str,
                        help='yaml config file path')
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./saved_results)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='run benchmark with the tuned int8 model')

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            logger.info("Evaluate:" + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v)
                              for k, v in result.items())
                results.update(result)

            if args.tune:

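                # LPOT repeatedly calls this function with candidate quantized
                # models and expects a single scalar back; the first matching
                # metric key becomes the tuning objective.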
                def eval_func_for_lpot(model):
                    result, perf = evaluate(args,
                                            model,
                                            tokenizer,
                                            prefix=prefix)
                    bert_task_acc_keys = [
                        'acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                eval_task_names = (
                    "mnli", "mnli-mm") if args.task_name == "mnli" else (
                        args.task_name, )

                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args,
                                                           eval_task,
                                                           tokenizer,
                                                           evaluate=True)

                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                        1, args.n_gpu)
                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)

                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)
                    from lpot import Quantization, common
                    quantizer = Quantization(args.config)
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = quantizer.dataset(
                        'bert',
                        dataset=eval_dataset,
                        task=eval_task,
                        model_type=args.model_type)
                    quantizer.model = common.Model(model)
                    quantizer.calib_dataloader = common.DataLoader(
                        eval_dataset, batch_size=args.eval_batch_size)
                    quantizer.eval_func = eval_func_for_lpot
                    q_model = quantizer()
                    q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)

                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step,
                                     calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.error(
                        "please run calibration before int8 inference")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

    return results
Example #26
def main():
    global args, best_acc1
    args = parser.parse_args()
    print('args:', args)

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # Val data loading
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(args.input_dim + 32),
            transforms.CenterCrop(args.input_dim),
            transforms.ToTensor(),
            normalize,
        ]))

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    num_classes = len(val_dataset.classes)
    print('Total classes: ', num_classes)

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'peleenet':
        model = PeleeNet(num_classes=num_classes)
    else:
        print(
            "=> unsupported model '{}'. creating PeleeNet by default.".format(
                args.arch))
        model = PeleeNet(num_classes=num_classes)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    elif args.pretrained:
        if os.path.isfile(args.weights):
            checkpoint = torch.load(args.weights,
                                    map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['state_dict'])

            print("=> loaded checkpoint '{}' (epoch {}, acc@1 {})".format(
                args.pretrained, checkpoint['epoch'], checkpoint['best_acc1']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.tune:
        model.eval()
        model.module.fuse_model()
        from lpot import Quantization, common
        quantizer = Quantization("./conf.yaml")
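        # Only the model is set here; the calibration and evaluation
        # dataloaders are expected to come from the dataloader sections of
        # conf.yaml.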
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        exit(0)

    # Training data loading
    traindir = os.path.join(args.data, 'train')

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.input_dim),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best Acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
Example #27
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    cfg = Config(**json.load(open(config, "r")))

    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    dataset = TaskDataset(
        args.dataset_location,
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch,
                 global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

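    # LPOT-style dataloader: each iteration must yield an (inputs, labels)
    # pair, so this thin wrapper regroups every torch batch accordingly.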
    class Bert_DataLoader(object):
        def __init__(self,
                     loader=None,
                     model_type=None,
                     device='cpu',
                     batch_size=1):
            self.loader = loader
            self.model_type = model_type
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {
                    'output_all': (batch[0], batch[1], batch[2]),
                    'labels': batch[3]
                }

                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        total_samples = 0
        total_time = 0
        index = 0

        class RandomDataset(object):
            def __init__(self, size, shape):
                self.len = size
                self.input_ids = torch.randint(low=0,
                                               high=30522,
                                               size=(size, shape),
                                               dtype=torch.int64)
                self.segment_ids = torch.randint(low=0,
                                                 high=1,
                                                 size=(size, shape),
                                                 dtype=torch.int64)
                self.input_mask = torch.randint(low=0,
                                                high=1,
                                                size=(size, shape),
                                                dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids, self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index],
                        self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size,
                                 shuffle=True)

        for batch in rand_loader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))

    def eval_func(model):
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter,
                                          batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))
        print('Accuracy: %.3f ' % (total_accuracy))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file,
                         None)  # not use pretrain_file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")

        if args.tune:
            import lpot
            from lpot import common
            # lpot tune
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter,
                                              batch_size=args.batch_size)

            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)
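            # The tuned model can be reloaded later with
            # lpot.utils.pytorch.load, as the --int8 branch below shows.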

        elif args.int8:
            from lpot.utils.pytorch import load
            int8_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)

        else:
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
Example #28
def main():
    # init the args
    args = Options().parse()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    # init dataloader
    interp = PIL.Image.BILINEAR if args.crop_size < 320 else PIL.Image.BICUBIC
    base_size = args.base_size if args.base_size is not None else int(
        1.0 * args.crop_size / 0.875)
    transform_val = transforms.Compose([
        ECenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    valset = ImageNetDataset(args.data, transform=transform_val, train=False)
    val_loader = torch.utils.data.DataLoader(
        valset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True if args.cuda else False)

    # assert args.model in torch.hub.list('zhanghang1989/ResNeSt', force_reload=True)
    functions = inspect.getmembers(module, inspect.isfunction)
    model_list = [f[0] for f in functions]
    assert args.model in model_list
    get_model = importlib.import_module('resnest.torch')
    net = getattr(get_model, args.model)
    # model = torch.hub.load('zhanghang1989/ResNeSt', args.model, pretrained=True)
    model = net(pretrained=True)
    # print(model)

    if args.cuda:
        model.cuda()
        # Please use CUDA_VISIBLE_DEVICES to control the number of gpus
        model = nn.DataParallel(model)

    # checkpoint
    if args.verify:
        if os.path.isfile(args.verify):
            print("=> loading checkpoint '{}'".format(args.verify))
            model.module.load_state_dict(torch.load(args.verify))
        else:
            raise RuntimeError("=> no verify checkpoint found at '{}'".format(
                args.verify))
    elif args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.module.load_state_dict(checkpoint['state_dict'])
        else:
            raise RuntimeError("=> no resume checkpoint found at '{}'".format(
                args.resume))

    model.eval()
    model.fuse_model()

    if args.tune:
        from lpot import Quantization, common
        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

    if args.int8:
        from lpot.utils.pytorch import load
        new_model = load(
            os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
    else:
        new_model = model

    top1 = AverageMeter()
    top5 = AverageMeter()
    batch_time = AverageMeter()
    iterations = args.iterations
    warmup = args.warmup_iterations
    tbar = tqdm(val_loader, desc='\r')
    for batch_idx, (data, target) in enumerate(tbar):
        if iterations == 0 or batch_idx < iterations + warmup:
            if batch_idx >= warmup:
                end = time.time()
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            with torch.no_grad():
                output = new_model(data)
                if batch_idx >= warmup:
                    batch_time.update(time.time() - end)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                top1.update(acc1[0], data.size(0))
                top5.update(acc5[0], data.size(0))

            tbar.set_description('Top1: %.3f | Top5: %.3f' %
                                 (top1.avg, top5.avg))
        elif batch_idx == iterations + warmup:
            break

    print('Batch size = %d' % args.batch_size)
    if args.batch_size == 1:
        print('Latency: %.3f ms' % (batch_time.avg * 1000))
    print('Throughput: %.3f images/sec' % (args.batch_size / batch_time.avg))
    print('Accuracy@1: {top1:.5f} Accuracy@5: {top5:.5f}'.format(
        top1=(top1.avg / 100), top5=(top5.avg / 100)))
Example #29
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    print("Use CPU: {} for training".format(gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True, quantize=False)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallelCPU(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.tune:
        def training_func_for_lpot(model):
            epochs = 8
            iters = 30
            optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
            for nepoch in range(epochs):
                model.train()
                cnt = 0
                for image, target in train_loader:
                    print('.', end='')
                    cnt += 1
                    output = model(image)
                    loss = criterion(output, target)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if cnt >= iters:
                        break

                if nepoch > 3:
                    # Freeze quantizer parameters
                    model.apply(torch.quantization.disable_observer)
                if nepoch > 2:
                    # Freeze batch norm mean and variance estimates
                    model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)

            return
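        # Fuse Conv/BN/ReLU modules before handing the model to LPOT; the
        # PyTorch QAT flow driven by q_func expects a fused model.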
        model.module.fuse_model()
        from lpot import Quantization, common
        quantizer = Quantization(args.config)
        quantizer.model = common.Model(model)
        quantizer.q_func = training_func_for_lpot
        quantizer.eval_dataloader = val_loader
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        return

    if args.benchmark:
        model.eval()
        model.module.fuse_model()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
        else:
            new_model = model
        validate(val_loader, new_model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
Example #30
                path_imgrec=dataset,
                label_width=1,
                preprocess_threads=data_nthreads,
                batch_size=batch_size,
                data_shape=data_shape,
                label_name=label_name,
                rand_crop=False,
                rand_mirror=False,
                shuffle=args.shuffle_dataset,
                shuffle_chunk_seed=args.shuffle_chunk_seed,
                seed=args.shuffle_seed,
                dtype=data_layer_type,
                ctx=args.ctx,
                **combine_mean_std)
            quantizer = Quantization("./cnn.yaml")
            quantizer.model = common.Model(fp32_model)
            quantizer.calib_dataloader = calib_data
            quantizer.eval_dataloader = data
            q_model = quantizer()
            q_model.save(args.output_graph)
            sys.exit()

        if args.accuracy_only:
            symbol_file = args.symbol_file
            param_file = args.param_file
            sym, arg_params, aux_params = load_model(symbol_file, param_file)
            score(sym,
                  arg_params,
                  aux_params,
                  data, [ctx],
                  label_name,