Example #1
    def _test_conv_bf16_base(self, dim):
        conv_module = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
        input_shapes = {1: (224,), 2: (224, 224), 3: (55, 55, 55)}
        options = itertools.product([True, False], [1, 2], [1, 4])
        for bias, dilation, groups in options:
            N = torch.randint(3, 10, (1,)).item()
            M = torch.randint(1, 3, (1,)).item() * groups
            C = torch.randint(1, 3, (1,)).item() * groups
            x_shape = (N, C) + input_shapes[dim]
            x = torch.randn(x_shape, dtype=torch.float32)

            conv = conv_module[dim](in_channels=C,
                                    out_channels=M,
                                    kernel_size=3,
                                    stride=2,
                                    padding=1,
                                    dilation=dilation,
                                    bias=bias,
                                    groups=groups).float()
            x_bf16 = x.bfloat16()
            if has_bf16_support():
                mkldnn_conv = mkldnn_utils.to_mkldnn(copy.deepcopy(conv))
                mkldnn_conv_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(conv), torch.bfloat16)
                y = mkldnn_conv(x.to_mkldnn()).to_dense()
                y_bf16 = mkldnn_conv_bf16(x_bf16.to_mkldnn()).to_dense(torch.float32)
                self.assertEqual(y, y_bf16, atol=1e-1, rtol=1e-3)
            else:
                msg = r"bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"
                with self.assertRaisesRegex(RuntimeError, msg):
                    mkldnn_conv_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(conv), torch.bfloat16)
                    y_bf16 = mkldnn_conv_bf16(x_bf16.to_mkldnn()).to_dense(torch.float32)
Example #2
def main_4():

	# check MKL-DNN
	print(*torch.__config__.show().split('\n'), sep = '\n')
	print('=======')

	orig_model = models.resnet101(False)
	input1 = torch.rand(100, 3, 224, 224)

	result = orig_model(input1)
	result = orig_model(input1)
	result = orig_model(input1)
	result = orig_model(input1)

	start_time = timeit.default_timer()
	result = orig_model(input1)
	run_time = timeit.default_timer() - start_time

	print('Python (CPU): {}'.format(run_time))

	orig_model.eval()
	mkldnn_model = mkldnn_utils.to_mkldnn(orig_model)
	input1 = input1.to_mkldnn()
	answer = torch.zeros(100, 1000).to_mkldnn()

	result = mkldnn_model(input1)
	result = mkldnn_model(input1)
	result = mkldnn_model(input1)
	result = mkldnn_model(input1)

	start_time = timeit.default_timer()
	result = mkldnn_model(input1)
	run_time = timeit.default_timer() - start_time

	print('Python (MKL-DNN): {}'.format(run_time))
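Note: the timing above runs with autograd enabled and never converts the MKL-DNN output back. A minimal sketch of the usual inference pattern (same resnet101 and input size as above; torch.no_grad() and the to_dense() conversion are additions):

import torch
import torchvision.models as models
from torch.utils import mkldnn as mkldnn_utils

model = models.resnet101(pretrained=False).eval()  # call eval() before conversion
mkldnn_model = mkldnn_utils.to_mkldnn(model)        # prepack the weights once

x = torch.rand(100, 3, 224, 224)
with torch.no_grad():                               # inference only, no autograd
    y = mkldnn_model(x.to_mkldnn()).to_dense()      # convert in, convert back out
print(y.shape)                                      # torch.Size([100, 1000])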
Example #3
    def test_0_dimension_tensor(self):
        x = torch.rand([20, 20, 1, 1], dtype=torch.float)
        y = torch.rand([20, 20, 0, 1], dtype=torch.float)

        # unary ops work without modification
        out_relu = torch.relu(y)
        out_relu_mkldnn = torch.relu(y.to_mkldnn()).to_dense()
        self.assertEqual(out_relu, out_relu_mkldnn)

        out_mul = x * y
        out_mul_mkldnn = (x.to_mkldnn() * y.to_mkldnn()).to_dense()
        self.assertEqual(out_mul, out_mul_mkldnn)

        out_add = x + y
        out_add_mkldnn = (x.to_mkldnn() + y.to_mkldnn()).to_dense()
        self.assertEqual(out_add, out_add_mkldnn)

        x.requires_grad_(True)
        y.requires_grad_(True)
        with self.assertRaisesRegex(RuntimeError,
                                    "0-dimension Tensor in training"):
            x.to_mkldnn() + y.to_mkldnn()

        with self.assertRaisesRegex(RuntimeError, "must match"):
            torch.rand([5]).to_mkldnn() + torch.rand([0]).to_mkldnn()

        C = 7
        m = torch.nn.Conv2d(C, C, 3)
        x = torch.randn(0, C, C, 8, dtype=torch.float)
        out_eager = m(x)
        out_mkldnn = mkldnn_utils.to_mkldnn(m)(x)
        self.assertEqual(out_eager, out_mkldnn)
Example #4
 def setup_data_and_model(self):
     x_train, y_train = self.load_data()
     args = [
         self.device,
         self.params["tensor_layout"],
         self.params["problem"]["precision"],
     ]
     self.x_train = [set_batch_device_precision(i, *args) for i in x_train]
     self.y_train = [set_batch_device_precision(i, *args) for i in y_train]
     if self.params["problem"]["precision"] == "FP16":
         self.net.half()
     if self.params["backend"] == "DNNL":
         torch.backends.mkldnn.enabled = True
         self.net.eval()  # This is to make it not fail when DNNL does not support train
         if self.params["tensor_layout"] == "DNNL":
             self.net = mkldnn_utils.to_mkldnn(self.net)
         else:
             logger.warning("Using DNNL backend without DNNL tensors")
     else:
         if self.params["backend"] == "native":
             torch.backends.mkldnn.enabled = False
             assert self.params["tensor_layout"] == "native"
         else:
             raise RuntimeError("Unknown backend")
Example #5
    def _test_conv_base(self, dim):
        conv_module = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
        input_shapes = {1: (224,), 2: (224, 224), 3: (55, 55, 55)}
        options = itertools.product([True, False], [1, 2], [1, 4])
        for bias, dilation, groups in options:
            N = torch.randint(3, 10, (1,)).item()
            M = torch.randint(1, 3, (1,)).item() * groups
            C = torch.randint(1, 3, (1,)).item() * groups
            x_shape = (N, C) + input_shapes[dim]
            x = torch.randn(x_shape, dtype=torch.float32)
            conv = conv_module[dim](in_channels=C,
                                    out_channels=M,
                                    kernel_size=3,
                                    stride=2,
                                    padding=1,
                                    dilation=dilation,
                                    bias=bias,
                                    groups=groups).float()
            mkldnn_conv = mkldnn_utils.to_mkldnn(copy.deepcopy(conv))
            with torch.backends.mkldnn.flags(enabled=False):
                y_aten = conv(x)
            y_mkldnn = mkldnn_conv(x.to_mkldnn()).to_dense()
            self.assertEqual(y_aten, y_mkldnn)

            self._test_serialization(mkldnn_conv, (x.to_mkldnn(),))
            self._test_tracing(mkldnn_conv, (x.to_mkldnn(),))
Example #6
    def _test_batch_norm_base(self, dim, channels, input):
        bn_module = {2: torch.nn.BatchNorm2d, 3: torch.nn.BatchNorm3d}
        bn = bn_module[dim](channels).float().train(False)
        mkldnn_bn = mkldnn_utils.to_mkldnn(copy.deepcopy(bn))
        self.assertEqual(bn(input), mkldnn_bn(input.to_mkldnn()).to_dense())

        self._test_serialization(mkldnn_bn, (input.to_mkldnn(), ))
        self._test_tracing(mkldnn_bn, (input.to_mkldnn(), ))
Example #7
 def _test_imagenet_model(self, model):
     model = model.train(False).float()
     mkldnn_model = mkldnn_utils.to_mkldnn(copy.deepcopy(model))
     x = torch.randn(1, 3, 224, 224, dtype=torch.float32)
     with torch.no_grad():
         self.assertEqual(
             model(x),
             mkldnn_model(x.to_mkldnn()).to_dense(),
         )
Example #8
    def test_linear(self):
        in_features = torch.randint(3, 10, (1, )).item()
        out_features = torch.randint(3, 100, (1, )).item()
        x = torch.randn(3, in_features, dtype=torch.float32) * 10

        for bias in [True, False]:
            linear = torch.nn.Linear(in_features, out_features).float()
            mkldnn_linear = mkldnn_utils.to_mkldnn(copy.deepcopy(linear))
            self.assertEqual(linear(x),
                             mkldnn_linear(x.to_mkldnn()).to_dense())
Example #9
    def test_linear_bf16(self):
        in_features = torch.randint(3, 10, (1,)).item()
        out_features = torch.randint(3, 100, (1,)).item()
        x = torch.randn(3, in_features, dtype=torch.float32) * 10
        x_bf16 = x.bfloat16()

        for bias in [True, False]:
            linear = torch.nn.Linear(in_features, out_features, bias=bias).float()
            mkldnn_linear = mkldnn_utils.to_mkldnn(copy.deepcopy(linear))
            mkldnn_linear_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(linear), torch.bfloat16)
            if has_bf16_support():
                y = mkldnn_linear(x.to_mkldnn()).to_dense()
                y_bf16 = mkldnn_linear_bf16(x_bf16.to_mkldnn()).to_dense(torch.float32)
                self.assertEqual(y, y_bf16, atol=1e-1, rtol=1e-3)
            else:
                msg = "mkldnn_linear: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"
                self.assertRaisesRegex(RuntimeError,
                                       msg,
                                       lambda: mkldnn_linear_bf16(x_bf16.to_mkldnn()))
Example #10
    def test_batch_norm2d(self):
        N = torch.randint(3, 10, (1, )).item()
        C = torch.randint(3, 100, (1, )).item()
        x = torch.randn(N, C, 35, 45, dtype=torch.float32) * 10

        # TODO: support training
        for train in [False]:
            bn = torch.nn.BatchNorm2d(C).float().train(train)
            mkldnn_bn = mkldnn_utils.to_mkldnn(copy.deepcopy(bn))
            self.assertEqual(bn(x), mkldnn_bn(x.to_mkldnn()).to_dense())
Example #11
 def _test_conv_base(self, dim):
     conv_module = {
         1: torch.nn.Conv1d,
         2: torch.nn.Conv2d,
         3: torch.nn.Conv3d
     }
     input_shapes = {1: (224, ), 2: (224, 224), 3: (55, 55, 55)}
     options = itertools.product([True, False], [True, False], [1, 2],
                                 [1, 4])
     for train, bias, dilation, groups in options:
         N = torch.randint(3, 10, (1, )).item()
         M = torch.randint(1, 3, (1, )).item() * groups
         C = torch.randint(1, 3, (1, )).item() * groups
         x_shape = (N, C) + input_shapes[dim]
         x = torch.randn(x_shape, dtype=torch.float32)
         conv = conv_module[dim](in_channels=C,
                                 out_channels=M,
                                 kernel_size=3,
                                 stride=2,
                                 padding=1,
                                 dilation=dilation,
                                 bias=bias,
                                 groups=groups).float()
         x1 = x.clone()
         x2 = x.clone().to_mkldnn()
         if not train:
             mkldnn_conv = mkldnn_utils.to_mkldnn(copy.deepcopy(conv))
         elif train and dim != 1:
             # TODO: enable conv1d training.
             x1.requires_grad_()
             x2.requires_grad_()
             mkldnn_conv = copy.deepcopy(conv)
         with torch.backends.mkldnn.flags(enabled=False):
             y_aten = conv(x1)
             if train and dim != 1:
                 loss1 = y_aten.sum()
                 loss1.backward()
         if not train or (train and dim != 1):
             y_mkldnn = mkldnn_conv(x2).to_dense()
             self.assertEqual(y_aten, y_mkldnn)
         if not train:
             self._test_serialization(mkldnn_conv, (x.to_mkldnn(), ))
             self._test_tracing(mkldnn_conv, (x.to_mkldnn(), ))
         elif dim != 1:
             loss2 = y_mkldnn.sum()
             loss2.backward()
             self.assertTrue(x2.grad.is_mkldnn)
             self.assertEqual(x1.grad, x2.grad.to_dense())
             self.assertEqual(conv.weight.grad,
                              mkldnn_conv.weight.grad,
                              atol=1e-3,
                              rtol=1e-3)
             if bias:
                 self.assertEqual(conv.bias.grad, mkldnn_conv.bias.grad)
Example #12
 def network(self, x):
     """ convert imgs to torch/mxnet and run network model and return numpy """
     X = self._to_device(x)
     if self.torch:
         self.net.eval()
         if self.mkldnn:
             self.net = mkldnn_utils.to_mkldnn(self.net)
     y, style = self.net(X)
     if self.mkldnn:
         self.net.to(torch_CPU)
     y = self._from_device(y)
     style = self._from_device(style)
     return y,style
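Note: the method above calls mkldnn_utils.to_mkldnn(self.net) on every forward pass, which repacks the weights each time. A minimal sketch of a one-time conversion, assuming the same self.net / self.mkldnn / self.torch attributes as above (the _net_converted flag is a hypothetical addition):

from torch.utils import mkldnn as mkldnn_utils

def network(self, x):
    """Convert the model to MKL-DNN layout only once, then reuse it."""
    X = self._to_device(x)
    if self.torch and self.mkldnn and not getattr(self, '_net_converted', False):
        self.net.eval()
        self.net = mkldnn_utils.to_mkldnn(self.net)  # prepack the weights a single time
        self._net_converted = True                   # hypothetical caching flag
    y, style = self.net(X)
    return self._from_device(y), self._from_device(style)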
Example #13
 def _test_batch_norm_bf16_base(self, dim, channels, input):
     bn_module = {2: torch.nn.BatchNorm2d, 3: torch.nn.BatchNorm3d}
     x_bf16 = input.bfloat16()
     # TODO: support training
     for train in [False]:
         bn = bn_module[dim](channels).float().train(train)
         mkldnn_bn = mkldnn_utils.to_mkldnn(copy.deepcopy(bn))
         if has_bf16_support():
             y = bn(input.to_mkldnn().to_dense())
             y_bf16 = bn(input.to_mkldnn().to_dense(torch.float))
             self.assertEqual(y, y_bf16, atol=1e-1, rtol=1e-3)
         else:
             msg = "mkldnn_batch_norm: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"
             self.assertRaisesRegex(RuntimeError, msg,
                                    lambda: bn(x_bf16.to_mkldnn()))
Example #14
    def test_reshape_blocked_format(self):
        # construct an mkldnn blocked tensor with mkldnn conv2d
        C = 7
        m = mkldnn_utils.to_mkldnn(torch.nn.Conv2d(C, C, 3))
        x = torch.randn(1, C, 8, 8).to_mkldnn()

        # mkldnn tensor w/ blocked format
        y_block = m(x)
        # aten tensor w/ plain format
        y_plain = y_block.to_dense()

        y_block_reshape = y_block.reshape(C, -1)
        y_plain_reshape = y_plain.reshape(C, -1)

        self.assertEqual(y_plain_reshape, y_block_reshape.to_dense())
Example #15
def pytorch_benchmark(net,
                      width,
                      height,
                      number_iter,
                      input_folder,
                      need_output=False,
                      output_folder='',
                      task_type='',
                      batch_size=1):
    filenames = os.listdir(input_folder)
    inference_times = []
    number_iter = (number_iter + batch_size - 1) // batch_size
    images, counts = load_images(width, height, input_folder,
                                 number_iter * batch_size)

    net = mkldnn_utils.to_mkldnn(net)

    t0_total = time()
    for i in range(number_iter):
        a = (i * batch_size) % len(images)
        b = (((i + 1) * batch_size - 1) % len(images)) + 1

        blob = images[a:b]
        blob = blob.to_mkldnn()

        t0 = time()

        output = net(blob)

        t1 = time()

        output = output.to_dense()

        if need_output and batch_size == 1:
            # Generate output name
            output_filename = str(
                os.path.splitext(os.path.basename(filenames[i]))[0]) + '.npy'
            output_filename = os.path.join(os.path.dirname(output_folder),
                                           output_filename)
            # Save output
            print(output.shape)
            print(np.argmax(np.array(output)[0]))
            #np.savetxt(output_filename, output)

        inference_times.append(t1 - t0)
    t1_total = time()
    inference_total_time = t1_total - t0_total
    return inference_times, inference_total_time
Example #16
def main():
    '''
    The following 3 components are required to perform training.
    1. model: Instantiate model class
    2. optim: Optimization function for updating the model parameters during training
    3. crite: Criterion function to minimize loss
    '''
    model = TestModel()
    optim = torch.optim.SGD(model.parameters(), lr=0.01)
    crite = nn.MSELoss(reduction='sum')
    '''
    1. Instantiate the Dataset class defined before
    2. Use torch.utils.data.DataLoader to load data from the Dataset instance
    '''
    train_data = TestDataset()
    trainLoader = DataLoader(train_data, batch_size=BS_TRAIN)
    test_data = TestDataset(train=False)
    testLoader = DataLoader(test_data, batch_size=BS_TEST)
    '''
    Perform training and inference
    Use model.train() to set the model to training mode and model.eval() to set it to inference mode.
    Use a for loop with enumerate() over a DataLoader instance to go through the whole dataset for training/inference.
    '''
    for i in range(0, EPOCHNUM - 1):
        model.train()
        for batch_index, (data, y_ans) in enumerate(trainLoader):
            '''
            1. Clear the gradients accumulated in the optimizer
            2. Do forward propagation
            3. Calculate the loss of the forward pass with the criterion function
            4. Calculate gradients with the backward() function
            5. Update the parameters of the model with the optimizer
            '''
            optim.zero_grad()
            y = model(data)
            loss = crite(y, y_ans)
            loss.backward()
            optim.step()

        model.eval()
        '''
        1. Users are advised to use JIT mode to get the best performance from MKL-DNN with minimal changes to the PyTorch code. The user may need to pass an explicit flag or invoke a specific MKL-DNN optimization pass. The PyTorch MKL-DNN JIT backend is still under development (RFC link https://github.com/pytorch/pytorch/issues/23657), so the example below is given in imperative mode.
        2. To have the model accelerated by MKL-DNN in imperative mode, the user needs to explicitly insert format conversions around MKL-DNN operations using tensor.to_mkldnn() and to_dense(). For best results, insert the conversions at the boundary of a sequence of MKL-DNN operations; this can boost performance significantly.
        3. For inference tasks, the user should prepack the model's weights using mkldnn_utils.to_mkldnn(model) to avoid the weight format conversion overhead. This can bring a good performance gain, especially for single-batch inference.
        '''
        model_mkldnn = mkldnn.to_mkldnn(model)
        for batch_index, data in enumerate(testLoader):
            y = model_mkldnn(data.to_mkldnn())
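Note: in the inference loop above the output y stays in MKL-DNN layout and autograd is still enabled. A minimal sketch of the same loop, reusing model and testLoader from the example, that disables autograd and converts the result back with to_dense() at the boundary of the MKL-DNN ops:

import torch
from torch.utils import mkldnn

model.eval()
model_mkldnn = mkldnn.to_mkldnn(model)  # prepack the weights once for inference
with torch.no_grad():                   # no gradients are needed for inference
    for batch_index, data in enumerate(testLoader):
        y = model_mkldnn(data.to_mkldnn()).to_dense()  # back to a plain dense tensor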
Example #17
 def test_conv2d(self):
     for groups in [1, 4]:
         N = torch.randint(3, 10, (1, )).item()
         C = torch.randint(1, 3, (1, )).item() * groups
         M = torch.randint(1, 3, (1, )).item() * groups
         x = torch.randn(N, C, 224, 224, dtype=torch.float32) * 100
         for bias in [True, False]:
             conv2d = torch.nn.Conv2d(in_channels=C,
                                      out_channels=M,
                                      kernel_size=3,
                                      stride=2,
                                      padding=1,
                                      bias=bias,
                                      groups=groups).float()
             mkldnn_conv2d = mkldnn_utils.to_mkldnn(copy.deepcopy(conv2d))
             self.assertEqual(conv2d(x),
                              mkldnn_conv2d(x.to_mkldnn()).to_dense())
Example #18
 def network(self, x, return_conv=False):
     """ convert imgs to torch/mxnet and run network model and return numpy """
     X = self._to_device(x)
     if self.torch:
         self.net.eval()
         if self.mkldnn:
             self.net = mkldnn_utils.to_mkldnn(self.net)
     y, style, conv = self.net(X)
     if self.mkldnn:
         self.net.to(torch_CPU)
     y = self._from_device(y)
     style = self._from_device(style)
     if return_conv:
         conv = self._from_device(conv)
         y = np.concatenate((y, conv), axis=1)
     
     return y, style
Example #19
    def test_conv3d(self):
        for groups in [1, 4]:
            N = torch.randint(3, 10, (1, )).item()
            C = torch.randint(1, 3, (1, )).item() * groups
            M = torch.randint(1, 3, (1, )).item() * groups
            x = torch.randn(N, C, 55, 55, 55, dtype=torch.float32)
            for bias in [True, False]:
                conv3d = torch.nn.Conv3d(in_channels=C,
                                         out_channels=M,
                                         kernel_size=3,
                                         stride=2,
                                         padding=1,
                                         bias=bias,
                                         groups=groups).float()
                mkldnn_conv3d = mkldnn_utils.to_mkldnn(copy.deepcopy(conv3d))
                with torch.backends.mkldnn.flags(enabled=False):
                    y_aten = conv3d(x)
                y_mkldnn = mkldnn_conv3d(x.to_mkldnn()).to_dense()
                self.assertEqual(y_aten, y_mkldnn)

                self._test_serialization(mkldnn_conv3d, (x.to_mkldnn(), ))
                self._test_tracing(mkldnn_conv3d, (x.to_mkldnn(), ))
Example #20
    def test_conv2d(self):
        options = itertools.product([1, 4], [True, False], [1, 2])
        for groups, bias, dilation in options:
            N = torch.randint(3, 10, (1, )).item()
            C = torch.randint(1, 3, (1, )).item() * groups
            M = torch.randint(1, 3, (1, )).item() * groups
            x = torch.randn(N, C, 224, 224, dtype=torch.float32)
            conv2d = torch.nn.Conv2d(in_channels=C,
                                     out_channels=M,
                                     kernel_size=3,
                                     stride=2,
                                     padding=1,
                                     dilation=dilation,
                                     bias=bias,
                                     groups=groups).float()
            mkldnn_conv2d = mkldnn_utils.to_mkldnn(copy.deepcopy(conv2d))
            with torch.backends.mkldnn.flags(enabled=False):
                y_aten = conv2d(x)
            y_mkldnn = mkldnn_conv2d(x.to_mkldnn()).to_dense()
            self.assertEqual(y_aten, y_mkldnn)

            self._test_serialization(mkldnn_conv2d, (x.to_mkldnn(), ))
            self._test_tracing(mkldnn_conv2d, (x.to_mkldnn(), ))
Example #21
def benchmark():
    # benchmark settings
    parser = argparse.ArgumentParser(description='PyTorch Convnet Benchmark')
    parser.add_argument('--arch',
                        action='store',
                        default='all',
                        choices=archs_list + ['all'],
                        help='model name can be specified. all is default.')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disable CUDA')
    parser.add_argument('--mkldnn',
                        action='store_true',
                        default=False,
                        help='use mkldnn weight cache')
    parser.add_argument('--inference',
                        action='store_true',
                        default=False,
                        help='run inference only')
    parser.add_argument('--single-batch-size',
                        action='store_true',
                        default=False,
                        help='single batch size')
    parser.add_argument('--print-iteration-time',
                        action='store_true',
                        default=False,
                        help='print iteration time')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    arch_dict = {
        args.arch: archs[args.arch]
    } if args.arch in archs_list else archs  # by huiming, support one or all models.

    if args.cuda:
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True
        cudnn.deterministic = True

        kernel = 'cudnn'
        p = subprocess.check_output('nvidia-smi --query-gpu=name --format=csv',
                                    shell=True)
        device_name = str(p).split('\\n')[1]
    else:
        kernel = 'nn'
        p = subprocess.check_output(
            'cat /proc/cpuinfo | grep name | head -n 1', shell=True)
        device_name = str(p).split(':')[1][:-3]

    print('Running on device: %s' % (device_name))
    print('Running on torch: %s' % (torch.__version__))
    print('Running on torchvision: %s\n' % (torchvision.__version__))

    def _time():
        if args.cuda:
            torch.cuda.synchronize()

        return time.time()

    for arch, sizes in arch_dict.items():
        if args.mkldnn and arch != 'resnet50' and arch != 'resnext101_32x8d':
            continue

        if arch == 'unet3d':
            batch_size, c, d, h, w = sizes[0], sizes[1], sizes[2], sizes[
                3], sizes[4]
            batch_size = 1 if args.single_batch_size else batch_size
            print('ModelType: %s, Kernels: %s Input shape: %dx%dx%dx%dx%d' %
                  (arch, kernel, batch_size, c, d, h, w))
            data = torch.randn(batch_size, c, d, h, w)
        else:
            batch_size, c, h, w = sizes[0], sizes[1], sizes[2], sizes[3]
            batch_size = 64 if arch == 'resnet50' and args.inference else batch_size
            batch_size = 1 if args.single_batch_size else batch_size
            print('ModelType: %s, Kernels: %s Input shape: %dx%dx%dx%d' %
                  (arch, kernel, batch_size, c, h, w))
            data = torch.randn(batch_size, c, h, w)

        target = torch.arange(1, batch_size + 1).long()
        net = models.__dict__[arch](
        )  # no need to load pre-trained weights for dummy data

        optimizer = optim.SGD(net.parameters(), lr=0.01)
        criterion = nn.CrossEntropyLoss()

        if args.cuda:
            data, target = data.cuda(), target.cuda()
            net.cuda()
            criterion = criterion.cuda()

        if args.mkldnn:
            data = data.to_mkldnn()
            if args.inference:
                net = mkldnn_utils.to_mkldnn(net)

        if args.inference:
            net.eval()
        else:
            net.train()
            net.aux_logits = False

        for i in range(nDryRuns):
            optimizer.zero_grad()  # zero the gradient buffers
            output = net(data)
            if not args.inference:
                if args.mkldnn:
                    output = output.to_dense()
                loss = output.sum() / 1e6 if 'unet' in arch else criterion(
                    output, target)
                loss.backward()
                optimizer.step()  # Does the update

        time_fwd, time_bwd, time_upt = 0, 0, 0

        for i in range(steps):
            optimizer.zero_grad()  # zero the gradient buffers
            t1 = _time()
            output = net(data)
            t2 = _time()
            if not args.inference:
                if args.mkldnn:
                    output = output.to_dense()
                loss = output.sum() / 1e6 if 'unet' in arch else criterion(
                    output, target)
                loss.backward()
                t3 = _time()
                optimizer.step()  # Does the update
                t4 = _time()
            time_fwd = time_fwd + (t2 - t1)
            if args.print_iteration_time:
                print("%-30s %d: %10.2f ms" % ('forward iteration', i,
                                               (t2 - t1) * 1000))
            if not args.inference:
                time_bwd = time_bwd + (t3 - t2)
                time_upt = time_upt + (t4 - t3)

        time_fwd_avg = time_fwd / steps * 1000
        time_bwd_avg = time_bwd / steps * 1000
        time_upt_avg = time_upt / steps * 1000

        # update not included!
        time_total = time_fwd_avg + time_bwd_avg

        print("%-30s %10s %10.2f (ms) %10.2f (imgs/s)" %
              (kernel, ':forward:', time_fwd_avg,
               batch_size * 1000 / time_fwd_avg))
        print("%-30s %10s %10.2f (ms)" % (kernel, ':backward:', time_bwd_avg))
        print("%-30s %10s %10.2f (ms)" % (kernel, ':update:', time_upt_avg))
        print("%-30s %10s %10.2f (ms) %10.2f (imgs/s)" %
              (kernel, ':total:', time_total, batch_size * 1000 / time_total))
Example #22
def validate(val_loader,
             model,
             criterion,
             args,
             is_INT8=False,
             is_calibration=False):
    if is_calibration:
        iterations = args.iter_calib
        warmup = 0
    else:
        iterations = args.iterations
        warmup = args.warmup_iterations
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()
    if args.evaluate and args.mkldnn and not args.cuda and not is_INT8:
        if args.bf16:
            model = mkldnn_utils.to_mkldnn(model, torch.bfloat16)
        else:
            model = mkldnn_utils.to_mkldnn(model)
        # TODO using mkldnn weight cache

    if args.dummy:
        images = torch.randn(args.batch_size, 3, 224, 224)
        target = torch.arange(1, args.batch_size + 1).long()

        if not is_INT8:
            if args.gpu is not None and args.cuda:
                images = images.cuda(args.gpu, non_blocking=True)
            if args.cuda:
                target = target.cuda(args.gpu, non_blocking=True)

            if args.bf16 and not args.cuda:
                images = images.to_mkldnn(torch.bfloat16)
            elif args.mkldnn and not args.cuda:
                images = images.to_mkldnn()

        if args.ipex:
            images = images.to(device='dpcpp:0')

        number_iter = len(val_loader)
        with torch.no_grad():
            for i in range(number_iter):
                if not args.evaluate or iterations == 0 or i < iterations + warmup:
                    if i >= warmup:
                        end = time.time()
                    # compute output
                    output = model(images)

                    # measure elapsed time
                    if i >= warmup:
                        batch_time.update(time.time() - end)

                    if args.mkldnn and not args.cuda and not is_INT8:
                        output = output.to_dense()

                    loss = criterion(output, target)

                    # measure accuracy and record loss
                    acc1, acc5 = accuracy(output, target, topk=(1, 5))

                    losses.update(loss.item(), images.size(0))
                    top1.update(acc1[0], images.size(0))
                    top5.update(acc5[0], images.size(0))

                    if i % args.print_freq == 0:
                        progress.display(i)
                elif i == iterations + warmup:
                    break

            if args.profile:
                # print("export profiling file to {}".format(torch.backends.quantized.engine + "_result.json"))
                with torch.autograd.profiler.profile() as prof:
                    output = model(images)
                prof.export_chrome_trace(torch.backends.quantized.engine +
                                         "_result.json")
                table_res = prof.key_averages().table(sort_by="cpu_time_total")
                print(table_res)
                save_profile_result(
                    torch.backends.quantized.engine + "_result_average.xlsx",
                    table_res)

            # TODO: this should also be done with the ProgressMeter
            if args.evaluate:
                batch_size = val_loader.batch_size
                latency = batch_time.avg / batch_size * 1000
                perf = batch_size / batch_time.avg
                print('inference latency %3.0f ms' % latency)
                print('inference performance %3.0f fps' % perf)

            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                top1=top1, top5=top5))

    else:
        with torch.no_grad():
            for i, (images, target) in enumerate(val_loader):
                if not args.evaluate or iterations == 0 or i < iterations + warmup:
                    if i >= warmup:
                        end = time.time()
                    if not is_INT8:
                        if args.gpu is not None and args.cuda:
                            images = images.cuda(args.gpu, non_blocking=True)
                        if args.cuda:
                            target = target.cuda(args.gpu, non_blocking=True)

                        if args.bf16 and not args.cuda:
                            images = images.to_mkldnn(torch.bfloat16)
                        elif args.mkldnn and not args.cuda:
                            images = images.to_mkldnn()

                    if args.ipex:
                        images = images.to(device='dpcpp:0')

                    # compute output
                    output = model(images)

                    # measure elapsed time
                    if i >= warmup:
                        batch_time.update(time.time() - end)

                    if args.mkldnn and not args.cuda and not is_INT8:
                        output = output.to_dense()

                    loss = criterion(output, target)

                    # measure accuracy and record loss
                    acc1, acc5 = accuracy(output, target, topk=(1, 5))

                    losses.update(loss.item(), images.size(0))
                    top1.update(acc1[0], images.size(0))
                    top5.update(acc5[0], images.size(0))

                    if i % args.print_freq == 0:
                        progress.display(i)
                elif i == iterations + warmup:
                    break

            if args.profile:
                # print("export profiling file to {}".format(torch.backends.quantized.engine + "_result.json"))
                with torch.autograd.profiler.profile() as prof:
                    output = model(images)
                prof.export_chrome_trace(torch.backends.quantized.engine +
                                         "_result.json")
                table_res = prof.key_averages().table(sort_by="cpu_time_total")
                print(table_res)
                save_profile_result(
                    torch.backends.quantized.engine + "_result_average.xlsx",
                    table_res)

            # TODO: this should also be done with the ProgressMeter
            if args.evaluate:
                batch_size = val_loader.batch_size
                latency = batch_time.avg / batch_size * 1000
                perf = batch_size / batch_time.avg
                print('inference latency %3.0f ms' % latency)
                print('inference performance %3.0f fps' % perf)

            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                top1=top1, top5=top5))

    return top1.avg
Example #23
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    calibation_iteration = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) /
        args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    print("  Batch size = %d" % args.eval_batch_size)

    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)

    all_results = []
    evalTime = 0
    nb_eval_steps = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        if calibration and nb_eval_steps >= calibation_iteration:
            break

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

            if args.model_type != 'distilbert':
                # XLM doesn't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})

            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(unique_id,
                                     start_logits,
                                     end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

        if nb_eval_steps >= args.warmup:
            evalTime += (timeit.default_timer() - start_time)

        nb_eval_steps += 1

        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break

    if nb_eval_steps >= args.warmup:
        perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print('Latency: %.3f ms' % (evalTime /
                                        (nb_eval_steps - args.warmup) * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)" %
              (evalTime, perf))
    else:
        logger.info(
            "*****no performance, please check dataset length and warmup number *****"
        )

    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration and args.iter == 0:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)

    # Compute the F1 and exact scores.
    if not calibration and args.iter == 0:
        results = squad_evaluate(examples, predictions)
        bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
    else:
        results = None
    return results, perf
Example #24
def inference(model, dataloader, datatype, args):
    batch_time = AverageMeter('Time', ':6.3f')
    batch_size = args.batch_size
    warmup_iters = args.warmup_iterations
    max_iters = args.max_iterations if dataloader is None else len(dataloader)
    model.eval()
    coco = get_coco_api_from_dataset(dataloader.dataset)
    iou_types = ["bbox"]
    iou_types.append("segm")
    coco_evaluator = CocoEvaluator(coco, iou_types)
    if args.ipex:
        import intel_extension_for_pytorch as ipex
        model = model.to(memory_format=torch.channels_last)
        model = ipex.optimize(model,
                              dtype=datatype,
                              level="O1",
                              conv_bn_folding=False,
                              replace_dropout_with_identity=False)
        model.backbone = ipex.optimize(model.backbone,
                                       dtype=datatype,
                                       level="O1")
    else:
        if args.jit:
            model = model.to(memory_format=torch.channels_last)
        else:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model, dtype=datatype)
    if args.jit:
        x = torch.randn(batch_size, 3, 1200,
                        1200).to(memory_format=torch.channels_last)
        if args.precision == "bf16":
            with torch.cpu.amp.autocast(), torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone,
                                                 x,
                                                 strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
        else:
            with torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone,
                                                 x,
                                                 strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
    with torch.no_grad():
        if dataloader is None:
            print(
                "Models for detection tasks need a real dataset. You need to specify the COCO dataset."
            )
            exit(1)
        else:
            for i, batch in enumerate(dataloader):
                images = batch[0]
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        if i == warmup_iters:
                            with profile(
                                    activities=[ProfilerActivity.CPU],
                                    record_shapes=True
                            ) as prof, record_function("model_inference"):
                                output = model(images)
                        else:
                            output = model(images)
                else:
                    if i == warmup_iters:
                        with profile(
                                activities=[ProfilerActivity.CPU],
                                record_shapes=True) as prof, record_function(
                                    "model_inference"):
                            output = model(images)
                    else:
                        output = model(images)
                if i > warmup_iters:
                    break
            for i, batch in enumerate(dataloader):
                images = batch[0]
                end = time.time()
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        output = model(images)
                else:
                    output = model(images)
                batch_time.update(time.time() - end)
                output = [{k: v.to(torch.float32)
                           for k, v in t.items()} for t in output]
                res = {
                    target["image_id"].item(): output
                    for target, output in zip(batch[1], output)
                }
                coco_evaluator.update(res)
                if max_iters != -1 and i >= max_iters:
                    break
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))
    latency = batch_time.avg / batch_size * 1000
    perf = batch_size / batch_time.avg
    coco_evaluator.synchronize_between_processes()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    print("Bbox AP: {:.5f} ".format(coco_evaluator.coco_eval['bbox'].stats[0]))
    print("Segm AP: {:.5f} ".format(coco_evaluator.coco_eval['segm'].stats[0]))
    print('Latency: %.3f ms' % latency)
    print("Throughput: {:.3f} fps".format(perf))
Example #25
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None

        if args.mkldnn:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model)
            print(model)

        if args.cached_weights:
            for batch in tqdm(eval_dataloader, desc="Tracing"):
                model.eval()
                batch = tuple(t.to(args.device) for t in batch)

                with torch.no_grad():
                    # inputs = {'input_ids':      batch[0],
                    #           'attention_mask': batch[1],
                    #           'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                    #           'labels':         batch[3]}
                    # traced = torch.jit.trace(model, **inputs, check_trace=False)
                    traced = torch.jit.trace(model, (batch[0], batch[2], batch[1], batch[3]), check_trace=False)
                    script = traced.save('jit_model.pt')
                    break

            model = torch.jit.load('jit_model.pt')
        total_time = 0
        num = 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels':         batch[3]}
                t0 = time.time()
                if args.cached_weights:
                    outputs = model(batch[0], batch[2], batch[1], batch[3])
                else:
                    outputs = model(**inputs)
                if num > 50:
                    total_time += time.time() - t0
                num += 1
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        print('{} batch/s'.format((num-50)/total_time))

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
Example #26
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            print(os.environ["RANK"])
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        print("args.dist_backend {}".format(args.dist_backend))
        print("args.dist_url {}".format(args.dist_url))
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            if use_gpu:
                model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            if use_gpu:
                print("create DistributedDataParallel")
                model = torch.nn.parallel.DistributedDataParallel(model)
            else:
                model.cpu()
                print("create DistributedDataParallelCPU")
                model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    if use_gpu:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # support mkldnn
    if (args.mkldnn and not args.cuda):
        print("##############mkldnn##############")
        model = mkldnn_utils.to_mkldnn(model)
        optimizer_util.to_mkldnn(optimizer)

        print("using mkldnn model\n")

    if use_gpu:
        cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        print("run epoch '{}'".format(epoch))
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if (args.rank == 0):
            if (args.mkldnn and not args.cuda):
                model = mkldnn_utils.to_dense(model)
                optimizer_util.to_dense(optimizer)
            print("#################save#################")
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
            if (args.mkldnn and not args.cuda):
                model = mkldnn_utils.to_mkldnn(model)
                optimizer_util.to_mkldnn(optimizer)
Exemple #27
0
def main(args):
    if args.ipex:
        import intel_pytorch_extension as ipex
        if args.fp16:
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)

    use_amp = False
    if not args.no_cuda and torch.cuda.is_available():
        device = torch.device('cuda')
        if args.fp16:
            use_amp = True
    elif args.ipex:
        device = ipex.DEVICE
    else:
        device = torch.device('cpu')

    log('Using PyTorch version: %s, Device: %s' % (torch.__version__, device))
    log(torch.__config__.show())

    cudnn.benchmark = True

    # Set up standard model.
    log('Initializing %s model...' % args.model)
    model = getattr(models, args.model)()
    model = model.to(device)
    if args.multi_gpu and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        log('Using %d GPUs with torch.nn.DataParallel' %
            torch.cuda.device_count())

    if args.mkldnn:
        model = mkldnn_utils.to_mkldnn(model)

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
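    # (editor's note) with enabled=use_amp left False on CPU, both autocast and
    # GradScaler become pass-throughs (scale() returns the loss unchanged and step()
    # simply calls optimizer.step()), so the same training step serves CUDA AMP and
    # plain CPU/MKL-DNN runs.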

    imsize = 224
    if args.model == 'inception_v3':
        imsize = 299

    def benchmark_step():
        #data, target = next(iter(loader))
        data = torch.randn(args.batch_size, 3, imsize, imsize)
        target = torch.LongTensor(args.batch_size).random_() % 1000

        if args.mkldnn:
            data = data.to_mkldnn()
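        # (editor's note) on the MKL-DNN path the batch stays in MKL-DNN's opaque
        # layout through the forward pass; the output is brought back with to_dense()
        # below before the loss is computed, which is why the extra conversion step
        # appears only when args.mkldnn is set.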

        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(data)
            if args.mkldnn:
                output = output.to_dense()
            if args.model == 'inception_v3':
                loss = F.cross_entropy(output.logits, target)
            else:
                loss = F.cross_entropy(output, target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    log('Model: %s' % args.model)
    log('Batch size: %d' % args.batch_size)

    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec' % (x, img_sec))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Total img/sec %.1f +-%.1f' % (img_sec_mean, img_sec_conf))
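    # (editor's note) 1.96 * std here is the spread of individual iterations under a
    # normality assumption; a 95% confidence interval on the mean throughput would
    # additionally divide by sqrt(args.num_iters).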
Exemple #28
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list;"
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train",
                        action='store_true',
                        help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16",
                        action='store_true',
                        help="run bf16 evaluation / training.")
    parser.add_argument("--tune",
                        action='store_true',
                        help="run ilit to tune int8 acc.")
    parser.add_argument("--warmup",
                        type=int,
                        default=2,
                        help="warmup for performance")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            logger.info("Evaluate:" + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v)
                              for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_ilit(model):
                    result, perf = evaluate(args,
                                            model,
                                            tokenizer,
                                            prefix=prefix)
                    bert_task_acc_keys = [
                        'acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    acc = 0.0  # default so a value is returned even if no known metric key is present
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Final eval {}: {}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                eval_task_names = (
                    "mnli", "mnli-mm") if args.task_name == "mnli" else (
                        args.task_name, )

                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args,
                                                           eval_task,
                                                           tokenizer,
                                                           evaluate=True)

                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                        1, args.n_gpu)
                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)

                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)
                    import ilit
                    tuner = ilit.Tuner("./conf.yaml")
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = tuner.dataset('bert',
                                                 dataset=eval_dataset,
                                                 task=eval_task)
                    test_dataloader = tuner.dataloader(
                        eval_dataset, batch_size=args.eval_batch_size)
                    tuner.tune(model,
                               test_dataloader,
                               eval_func=eval_func_for_ilit)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step,
                                     calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
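            # (editor's note) the calibration block above follows the eager-mode
            # post-training static quantization recipe: attach a qconfig, propagate it,
            # insert observers, run a small number of evaluation batches to collect
            # activation statistics, then convert() swaps in quantized modules.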
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.error(
                        "please do calibrantion befor run int8 inference")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

    return results
Exemple #29
0
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )

    results = {}
    for eval_task in eval_task_names:
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if calibration:
            args.eval_batch_size = 16
        else:
            args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        # calibrate on roughly 5% of the eval set, rounded up to whole batches
        calibration_iteration = int(
            (len(eval_dataset) * 0.05 + args.eval_batch_size - 1) /
            args.eval_batch_size)
        # multi-gpu eval
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        if args.mkldnn_eval:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model)
            print(model)

        import timeit
        total_time = 0.0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            if calibration and nb_eval_steps >= calibration_iteration:
                break
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                if nb_eval_steps >= args.warmup:
                    start = timeit.default_timer()
                outputs = model(**inputs)
                if nb_eval_steps >= args.warmup:
                    total_time += (timeit.default_timer() - start)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if args.do_bf16:
                if preds is None:
                    preds = logits.detach().cpu().to(torch.float).numpy()
                    out_label_ids = inputs['labels'].detach().cpu().to(
                        torch.float).numpy()
                else:
                    preds = np.append(preds,
                                      logits.detach().cpu().to(
                                          torch.float).numpy(),
                                      axis=0)
                    out_label_ids = np.append(
                        out_label_ids,
                        inputs['labels'].detach().cpu().to(
                            torch.float).numpy(),
                        axis=0)
            else:
                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    out_label_ids = inputs['labels'].detach().cpu().numpy()
                else:
                    preds = np.append(preds,
                                      logits.detach().cpu().numpy(),
                                      axis=0)
                    out_label_ids = np.append(
                        out_label_ids,
                        inputs['labels'].detach().cpu().numpy(),
                        axis=0)
        if nb_eval_steps >= args.warmup:
            perf = (len(eval_dataloader) -
                    args.warmup) * args.eval_batch_size / total_time
            logger.info("***** performance: {} samples/s *****".format(perf))
        else:
            perf = 0.0
            logger.info(
                "***** no performance measured; please check the dataset length and warmup number *****"
            )
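        # (editor's note) the throughput above counts whole batches processed after
        # the warmup steps; a final partial batch is still counted at full batch
        # size, so the figure is a slight over-estimate.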
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))

    return results, perf
Exemple #30
0
def main():
    args = parser.parse_args()
    print(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.cuda and args.mkldnn:
        assert False, "Cannot run this workload on both the GPU backend and the " \
                      "MKLDNN backend; please select a single backend.\n"

    if args.cuda:
        print("Using GPU backend to do this work.\n")
    elif args.mkldnn:
        print("Using MKLDNN backend to do this work.\n")
    else:
        print("Using native CPU backend to do this work.\n")

    # set it to the folder where video files are saved
    video_dir = args.video_dir + "/UCF-101"
    # set it to the folder where dataset splitting files are saved
    splits_dir = args.video_dir + "/ucfTrainTestlist"
    # set it to the file path for saving the metadata
    metadata_file = args.video_dir + "/metadata.pth"

    resnext3d_configs = model_config.ResNeXt3D_Config(video_dir, splits_dir, metadata_file, args.num_epochs)
    resnext3d_configs.setUp()

    datasets = {}
    dataset_train_configs = resnext3d_configs.dataset_configs["train"]
    dataset_test_configs = resnext3d_configs.dataset_configs["test"]
    dataset_train_configs["batchsize_per_replica"] = args.batch_size_train
    # For testing, batchsize per replica should be equal to clips_per_video
    dataset_test_configs["batchsize_per_replica"] = args.batch_size_eval
    dataset_test_configs["clips_per_video"] = args.batch_size_eval

    datasets["train"] = build_dataset(dataset_train_configs)
    datasets["test"] = build_dataset(dataset_test_configs)

    model = build_model(resnext3d_configs.model_configs)
    meters = build_meters(resnext3d_configs.meters_configs)
    loss = build_loss({"name": "CrossEntropyLoss"})
    optimizer = build_optimizer(resnext3d_configs.optimizer_configs)

    # some ops in the model head are not supported by MKLDNN, so swap in a head whose input is converted back to a dense CPU tensor
    if args.mkldnn:
        heads_configs = resnext3d_configs.model_configs['heads'][0]
        in_plane = heads_configs['in_plane']
        num_classes = heads_configs['num_classes']
        act_func = heads_configs['activation_func']
        mkldnn_head_fcl = MkldnnFullyConvolutionalLinear(in_plane, num_classes, act_func)

        if args.evaluate:
            model = model.eval()
            model = mkldnn_utils.to_mkldnn(model)
            model._heads['pathway0-stage4-block2']['default_head'].head_fcl = mkldnn_head_fcl.eval()
        else:
            model._heads['pathway0-stage4-block2']['default_head'].head_fcl = mkldnn_head_fcl
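            # (editor's note) MkldnnFullyConvolutionalLinear is assumed to be a
            # project-local wrapper that calls to_dense() on its MKL-DNN input,
            # applies the fully-connected head (and activation) on plain CPU tensors,
            # and returns the dense result, bridging the ops the MKL-DNN backend
            # does not cover.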

    # print(model)
    if args.evaluate:
        validata(datasets, model, loss, meters, args)
        return

    train(datasets, model, loss, optimizer, meters, args)