Example #1
def main():
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)
    if args.ipex:
        print(
            "enabling the Intel PyTorch Extension (IPEX) path for better performance")
        import intel_pytorch_extension as ipex
    if args.precision == 'bf16':
        assert args.ipex, 'please enable ipex first by adding the --ipex option to run the bfloat16 path'
        print("enabling the Intel PyTorch Extension mixed precision (fp32+bf16) path")
        ipex.enable_auto_mixed_precision(
            mixed_dtype=torch.bfloat16, train=False if args.evaluate else True)

    if args.jit:
        assert args.evaluate, 'the jit fusion path only supports the evaluation step'

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        if args.cuda:
            cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None and args.cuda:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count() if args.cuda else 0
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
Example #2
    def __init__(
        self,
        precision_plugin: PrecisionPlugin = PrecisionPlugin(),
        training_type_plugin: TrainingTypePlugin = SingleDevicePlugin(
            torch.device(ipex.DEVICE)),
        enable_bf16: bool = False,
    ) -> None:
        """

        Args:
            precision_plugin: the plugin to handle precision-specific parts
            training_type_plugin: the plugin to handle different training routines
        """
        if enable_bf16:
            # Automatically mix precision
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)

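        # ipex.DEVICE is the extension's device string; the SingleDevicePlugin above is built on the same device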
        self.device = ipex.DEVICE

        super().__init__(precision_plugin=precision_plugin,
                         training_type_plugin=training_type_plugin)
Example #3
    def do_warmup(self):
        print('Start warmup...')
        length_list = {}
        count = 0
        idxs = self.qsl.idxs()
        for i in idxs:
            feature_list = []
            feature_length_list = []
            waveform = self.qsl[i]
            feature_element, feature_length = self.audio_preprocessor.forward(
                (torch.from_numpy(waveform).unsqueeze(0),
                 torch.tensor(len(waveform)).unsqueeze(0)))
            feature_list.append(feature_element.squeeze(0).transpose_(0, 1))
            feature_length_list.append(feature_length.squeeze(0))
            feature = torch.nn.utils.rnn.pad_sequence(feature_list,
                                                      batch_first=True)
            feature_length = torch.tensor(feature_length_list)

            if feature_length[0].item() in length_list:
                continue
            length_list[feature_length[0].item()] = True

            assert feature.ndim == 3
            assert feature_length.ndim == 1
            if self.ipex:
                import intel_pytorch_extension as ipex
                if self.bf16:
                    ipex.enable_auto_mixed_precision(
                        mixed_dtype=torch.bfloat16)
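                # Route supported ops through oneDNN (MKL-DNN) kernels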
                ipex.core.enable_auto_dnnl()
                feature = feature.to(ipex.DEVICE)
                feature_length = feature_length.to(ipex.DEVICE)
            feature_ = feature.permute(1, 0, 2)
            _, _, transcripts = self.greedy_decoder.forward_batch(
                feature_, feature_length, self.rank)

            count += 1
            if self.rank == 0 and count % 10 == 0:
                print('Warmed up {} samples'.format(count))
        print('Warmup done')
Example #4
import cv2
import glob
import time
import torch
import numpy as np
from postprocess import get_seg
from models.detectors.solov2 import SOLOv2

import warnings
warnings.filterwarnings('ignore')

import intel_pytorch_extension as ipex
# Automatically mix precision
ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)


def imnormalize_(img, mean, std, to_rgb=True):
    """Inplace normalize an image with mean and std.

    Args:
        img (ndarray): Image to be normalized.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to convert to rgb.

    Returns:
        ndarray: The normalized image.
    """
    # cv2 inplace normalization does not accept uint8
    assert img.dtype != np.uint8
    mean = np.float64(mean.reshape(1, -1))
    stdinv = 1 / np.float64(std.reshape(1, -1))
    if to_rgb:
        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace
    cv2.subtract(img, mean, img)  # inplace
    cv2.multiply(img, stdinv, img)  # inplace
    return img
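
# A minimal usage sketch (hypothetical file name; mean/std are the common ImageNet values):
#   img = cv2.imread('sample.jpg').astype(np.float32)
#   mean = np.array([123.675, 116.28, 103.53])
#   std = np.array([58.395, 57.12, 57.375])
#   img = imnormalize_(img, mean, std, to_rgb=True)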
Example #5
    def __exit__(self, *args, **kwargs):
        if self.old_value:
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16,
                                             train=self.train_old_value)
        else:
            ipex.enable_auto_mixed_precision(mixed_dtype=None)
Example #6
    def __enter__(self):
        if self.enable_or_not:
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16,
                                             train=self.train)
        else:
            ipex.enable_auto_mixed_precision(mixed_dtype=None)
Example #7
def main(
    args,
    init_distributed=False,
    after_distributed_init_fn: Optional[Callable[[argparse.Namespace],
                                                 argparse.Namespace]] = None,
):
    utils.import_user_module(args)

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"
    metrics.reset()

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu and not getattr(
            args, "tpu", False):
        torch.cuda.set_device(args.device_id)

    if args.ipex:
        import intel_pytorch_extension as ipex
        if args.dnnl:
            ipex.core.enable_auto_dnnl()
        else:
            ipex.core.disable_auto_dnnl()
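        # Optionally turn on BF16 auto mixed precision for the training path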
        if args.mix_precision:
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16,
                                             train=True)

    np.random.seed(args.seed)
    utils.set_torch_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)
        if after_distributed_init_fn:
            args = after_distributed_init_fn(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info("model {}, criterion {}".format(args.arch,
                                                criterion.__class__.__name__))
    logger.info("num. model params: {} (num. trained: {})".format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # (optionally) Configure quantization
    if args.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )
    else:
        quantizer = None

    # Build trainer
    if args.model_parallel_size == 1:
        trainer = Trainer(args, task, model, criterion, quantizer)
    else:
        trainer = MegatronTrainer(args, task, model, criterion)

    logger.info("training on {} devices (GPUs/TPUs)".format(
        args.distributed_world_size))
    logger.info(
        "max tokens per GPU = {} and max sentences per GPU = {}".format(
            args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)
    if args.tpu:
        import torch_xla.core.xla_model as xm

        xm.rendezvous("load_checkpoint")  # wait for all workers
        xm.mark_step()

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # train for one epoch
        valid_losses, should_stop = train(args, trainer, task, epoch_itr)
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in getattr(args, "data", "")),
        )
    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
Example #8
def main(args):
    if args.ipex:
        import intel_pytorch_extension as ipex
        if args.fp16:
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)

    use_amp = False
    if not args.no_cuda and torch.cuda.is_available():
        device = torch.device('cuda')
        if args.fp16:
            use_amp = True
    elif args.ipex:
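        # Fall back to the IPEX device when CUDA is not used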
        device = ipex.DEVICE
    else:
        device = torch.device('cpu')

    log('Using PyTorch version: %s, Device: %s' % (torch.__version__, device))
    log(torch.__config__.show())

    cudnn.benchmark = True

    # Set up standard model.
    log('Initializing %s model...' % args.model)
    model = getattr(models, args.model)()
    model = model.to(device)
    if args.multi_gpu and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        log('Using %d GPUs with torch.nn.DataParallel' %
            torch.cuda.device_count())

    if args.mkldnn:
        model = mkldnn_utils.to_mkldnn(model)

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    imsize = 224
    if args.model == 'inception_v3':
        imsize = 299

    def benchmark_step():
        #data, target = next(iter(loader))
        data = torch.randn(args.batch_size, 3, imsize, imsize)
        target = torch.LongTensor(args.batch_size).random_() % 1000

        if args.mkldnn:
            data = data.to_mkldnn()

        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(data)
            if args.mkldnn:
                output = output.to_dense()
            if args.model == 'inception_v3':
                loss = F.cross_entropy(output.logits, target)
            else:
                loss = F.cross_entropy(output, target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    log('Model: %s' % args.model)
    log('Batch size: %d' % args.batch_size)

    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec' % (x, img_sec))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Total img/sec %.1f +-%.1f' % (img_sec_mean, img_sec_conf))
Example #9
    def run(self):
        core_list = range(self.start_core, self.end_core + 1)
        num_cores = len(core_list)
        os.sched_setaffinity(self.pid, core_list)
        cmd = "taskset -p -c %d-%d %d" % (self.start_core, self.end_core,
                                          self.pid)
        print(cmd)
        os.system(cmd)
        os.environ['OMP_NUM_THREADS'] = '{}'.format(self.end_core -
                                                    self.start_core + 1)
        print("### set rank {} to cores [{}:{}]; omp num threads = {}".format(
            self.rank, self.start_core, self.end_core, num_cores))

        torch.set_num_threads(num_cores)

        if not self.model_init:
            print("lazy_init rank {}".format(self.rank))
            config = toml.load(self.config_toml)
            dataset_vocab = config['labels']['labels']
            rnnt_vocab = add_blank_label(dataset_vocab)
            featurizer_config = config['input_eval']
            self.audio_preprocessor = AudioPreprocessing(**featurizer_config)
            self.audio_preprocessor.eval()
            self.audio_preprocessor = torch.jit.script(self.audio_preprocessor)
            self.audio_preprocessor = torch.jit._recursive.wrap_cpp_module(
                torch._C._freeze_module(self.audio_preprocessor._c))

            model = RNNT(feature_config=featurizer_config,
                         rnnt=config['rnnt'],
                         num_classes=len(rnnt_vocab))
            checkpoint = torch.load(self.checkpoint_path, map_location="cpu")
            migrated_state_dict = {}
            for key, value in checkpoint['state_dict'].items():
                key = key.replace("joint_net", "joint.net")
                migrated_state_dict[key] = value
            del migrated_state_dict["audio_preprocessor.featurizer.fb"]
            del migrated_state_dict["audio_preprocessor.featurizer.window"]
            model.load_state_dict(migrated_state_dict, strict=True)

            if self.ipex:
                import intel_pytorch_extension as ipex
                if self.bf16:
                    ipex.enable_auto_mixed_precision(
                        mixed_dtype=torch.bfloat16)
                ipex.core.enable_auto_dnnl()
                model = model.to(ipex.DEVICE)

            model.eval()
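            # On the IPEX path the encoder/prediction submodules are left unscripted; only the joint net is scripted and frozen below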
            if not self.ipex:
                model.encoder = torch.jit.script(model.encoder)
                model.encoder = torch.jit._recursive.wrap_cpp_module(
                    torch._C._freeze_module(model.encoder._c))
                model.prediction = torch.jit.script(model.prediction)
                model.prediction = torch.jit._recursive.wrap_cpp_module(
                    torch._C._freeze_module(model.prediction._c))
            model.joint = torch.jit.script(model.joint)
            model.joint = torch.jit._recursive.wrap_cpp_module(
                torch._C._freeze_module(model.joint._c))
            if not self.ipex:
                model = torch.jit.script(model)

            self.greedy_decoder = ScriptGreedyDecoder(
                len(rnnt_vocab) - 1, model)

            self.model_init = True

        if self.warmup:
            self.do_warmup()

        self.lock.acquire()
        self.init_counter.value += 1
        self.lock.release()

        if self.rank == 0 and self.cosim:
            print('Running in cosim mode, performance will be slow!!!')
        if self.rank == 0 and self.profile:
            print('Start profiler')
            with profiler.profile(record_shapes=True) as prof:
                self.run_queue(debug=True)
            print(prof.key_averages().table(sort_by="self_cpu_time_total",
                                            row_limit=20))
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=20))
            print(
                prof.key_averages(group_by_input_shape=True).table(
                    sort_by="self_cpu_time_total", row_limit=40))
            print(
                prof.key_averages(group_by_input_shape=True).table(
                    sort_by="cpu_time_total", row_limit=40))
            while self.run_queue():
                pass
        else:
            while self.run_queue():
                pass
Example #10
    def run_queue(self, debug=False):
        next_task = self.task_queue.get()
        if next_task is None:
            self.task_queue.task_done()
            return False

        query_id_list = next_task.query_id_list
        query_idx_list = next_task.query_idx_list
        query_len = len(query_id_list)
        with torch.no_grad():
            t1 = time.time()
            serial_audio_processor = True
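            # Serial path: preprocess each utterance and pad the features; the else branch pads raw waveforms and preprocesses them as one batch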
            if serial_audio_processor:
                feature_list = []
                feature_length_list = []
                for idx in query_idx_list:
                    waveform = self.qsl[idx]
                    feature_element, feature_length = self.audio_preprocessor.forward(
                        (torch.from_numpy(waveform).unsqueeze(0),
                         torch.tensor(len(waveform)).unsqueeze(0)))
                    feature_list.append(
                        feature_element.squeeze(0).transpose_(0, 1))
                    feature_length_list.append(feature_length.squeeze(0))
                feature = torch.nn.utils.rnn.pad_sequence(feature_list,
                                                          batch_first=True)
                feature_length = torch.tensor(feature_length_list)
            else:
                waveform_list = []
                for idx in query_idx_list:
                    waveform = self.qsl[idx]
                    waveform_list.append(torch.from_numpy(waveform))
                waveform_batch = torch.nn.utils.rnn.pad_sequence(
                    waveform_list, batch_first=True)
                waveform_lengths = torch.tensor(
                    [waveform.shape[0] for waveform in waveform_list],
                    dtype=torch.int64)

                feature, feature_length = self.audio_preprocessor.forward(
                    (waveform_batch, waveform_lengths))

            assert feature.ndim == 3
            assert feature_length.ndim == 1
            if self.ipex:
                import intel_pytorch_extension as ipex
                if self.bf16:
                    ipex.enable_auto_mixed_precision(
                        mixed_dtype=torch.bfloat16)
                ipex.core.enable_auto_dnnl()
                feature = feature.to(ipex.DEVICE)
                feature_length = feature_length.to(ipex.DEVICE)
            if serial_audio_processor:
                feature_ = feature.permute(1, 0, 2)
            else:
                feature_ = feature.permute(2, 0, 1)
            t3 = time.time()
            if query_len == 1:
                _, _, transcripts = self.greedy_decoder.forward_single_batch(
                    feature_, feature_length, self.ipex, self.rank)
            else:
                _, _, transcripts = self.greedy_decoder.forward_batch(
                    feature_, feature_length, self.ipex, self.rank)
            t4 = time.time()
            # cosim: cross-check the batched decoder output against the reference (unbatched) decoder
            if self.cosim:
                _, _, transcripts0 = self.greedy_decoder.forward(
                    feature, feature_length)
                if transcripts0 != transcripts:
                    print(
                        'vvvvvv difference between reference and batch impl. vvvvvv'
                    )
                    for i in range(query_len):
                        if transcripts0[i] != transcripts[i]:
                            for j in range(len(transcripts0[i])):
                                if transcripts0[i][j] != transcripts[i][j]:
                                    break
                            print('[{}] reference'.format(i))
                            print('{} diff {}'.format(transcripts0[i][0:j],
                                                      transcripts0[i][j:]))
                            print('[{}] batch'.format(i))
                            print('{} diff {}'.format(transcripts[i][0:j],
                                                      transcripts[i][j:]))
                            print('')
                    print(
                        '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'
                    )
                else:
                    print('.', end='', flush=True)

        t6 = time.time()
        assert len(transcripts) == query_len
        for id, trans in zip(query_id_list, transcripts):
            self.result_queue.put(Output(id, trans))
        t2 = time.time()
        dur = t2 - t1
        if debug:
            print('Audio {} Infer {} Total {}'.format(t3 - t1, t4 - t3,
                                                      t2 - t1))
            if query_len > 1:
                print("#### rank {} finish {} sample in {:.3f} sec".format(
                    self.rank, query_len, dur))
            else:
                print(
                    "#### rank {} finished a sample of feature_len={} in {:.3f} sec"
                    .format(self.rank, feature_length[0].item(), dur))

        self.task_queue.task_done()
        return True