def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.fp16:
        optim_level = Optimization.mxprO3
    else:
        optim_level = Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        manifest_filepath=val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size,
        pad_to_max=featurizer_config['pad_to'] == "max",
        shuffle=False,
        multi_gpu=False)

    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    audio_preprocessor.eval()

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        args=args)

def __init__(self, config_toml, checkpoint_path, dataset_dir,
             manifest_filepath, perf_count):
    config = toml.load(config_toml)

    dataset_vocab = config['labels']['labels']
    rnnt_vocab = add_blank_label(dataset_vocab)
    featurizer_config = config['input_eval']

    self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries,
                               self.process_latencies)
    self.qsl = AudioQSLInMemory(dataset_dir,
                                manifest_filepath,
                                dataset_vocab,
                                featurizer_config["sample_rate"],
                                perf_count)

    self.audio_preprocessor = AudioPreprocessing(**featurizer_config)
    self.audio_preprocessor.eval()
    self.audio_preprocessor = torch.jit.script(self.audio_preprocessor)
    self.audio_preprocessor = torch.jit._recursive.wrap_cpp_module(
        torch._C._freeze_module(self.audio_preprocessor._c))

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=config['rnnt'],
        num_classes=len(rnnt_vocab)
    )
    model.load_state_dict(load_and_migrate_checkpoint(checkpoint_path),
                          strict=True)
    model.eval()
    model.encoder = torch.jit.script(model.encoder)
    model.encoder = torch.jit._recursive.wrap_cpp_module(
        torch._C._freeze_module(model.encoder._c))
    model.prediction = torch.jit.script(model.prediction)
    model.prediction = torch.jit._recursive.wrap_cpp_module(
        torch._C._freeze_module(model.prediction._c))
    model.joint = torch.jit.script(model.joint)
    model.joint = torch.jit._recursive.wrap_cpp_module(
        torch._C._freeze_module(model.joint._c))
    model = torch.jit.script(model)

    self.greedy_decoder = ScriptGreedyDecoder(len(rnnt_vocab) - 1, model)

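# A minimal sketch (not part of this repo): newer PyTorch releases expose the same
# script-then-freeze step used above through the public torch.jit.freeze API,
# instead of the internal torch._C._freeze_module / wrap_cpp_module pair.
import torch

def script_and_freeze(module):
    """Script an eval-mode module and freeze it for inference (sketch only)."""
    module.eval()                       # freezing requires eval mode
    scripted = torch.jit.script(module)
    return torch.jit.freeze(scripted)   # public equivalent of the calls above
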
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    #torch.set_default_dtype(torch.double)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    if args.cuda:
        assert torch.cuda.is_available()

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']

    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        manifest_filepath=val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size,
        pad_to_max=featurizer_config['pad_to'] == "max",
        shuffle=False,
        sampler='bucket'  # sort by duration
    )

    audio_preprocessor = AudioPreprocessing(**featurizer_config)

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None and args.mode in [3]:
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    audio_preprocessor.featurizer.normalize = "per_feature"
    if args.cuda:
        audio_preprocessor.cuda()
    audio_preprocessor.eval()

    eval_transforms = []
    if args.cuda:
        eval_transforms.append(lambda xs: [xs[0].cuda(), xs[1].cuda(), *xs[2:]])
    eval_transforms.append(lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]])
    # These are just some very confusing transposes, that's all.
    # BxFxT -> TxBxF
    eval_transforms.append(lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]])
    eval_transforms = torchvision.transforms.Compose(eval_transforms)

    if args.cuda:
        model.cuda()

    # Ideally, I would jit this as well... But this is just the constructor...
    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args)

def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    multi_gpu = args.local_rank is not None
    if multi_gpu:
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    if args.fp16:
        optim_level = Optimization.mxprO3
    else:
        optim_level = Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"

    print('model_config')
    print_dict(model_definition)
    print('feature_config')
    print_dict(featurizer_config)

    data_layer = None
    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            sort_by_duration=args.sort_by_duration,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == "max",
            shuffle=False,
            multi_gpu=multi_gpu)

    audio_preprocessor = AudioPreprocessing(**featurizer_config)

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.ipex:
        import intel_extension_for_pytorch as ipex
        from rnn import IPEXStackTime

        model.joint_net.eval()
        data_type = torch.bfloat16 if args.mix_precision else torch.float32
        if model.encoder["stack_time"].factor == 2:
            model.encoder["stack_time"] = IPEXStackTime(model.encoder["stack_time"].factor)
        model.joint_net = ipex.optimize(model.joint_net, dtype=data_type,
                                        auto_kernel_selection=True)
        model.prediction["embed"] = model.prediction["embed"].to(data_type)

        if args.jit:
            print("running jit path")
            model.joint_net.eval()
            # trace the joint net with a single (time, label) position
            example_input = torch.randn(
                args.batch_size, 1, 1,
                model_definition['rnnt']['encoder_n_hidden']
                + model_definition['rnnt']['pred_n_hidden'])
            if args.mix_precision:
                with torch.cpu.amp.autocast(), torch.no_grad():
                    model.joint_net = torch.jit.trace(model.joint_net, example_input,
                                                      check_trace=False)
            else:
                with torch.no_grad():
                    model.joint_net = torch.jit.trace(model.joint_net, example_input,
                                                      check_trace=False)
            model.joint_net = torch.jit.freeze(model.joint_net)
    else:
        model = model.to("cpu")

    if args.wav is None:
        N = len(data_layer)
        step_per_epoch = math.ceil(N / (args.batch_size * (
            1 if not torch.distributed.is_initialized()
            else torch.distributed.get_world_size())))

        if args.steps is not None:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(
                args.steps * args.batch_size * (
                    1 if not torch.distributed.is_initialized()
                    else torch.distributed.get_world_size())))
            print('Have {0} warm up steps / (gpu * epoch).'.format(args.warm_up))
            print('Have {0} measure steps / (gpu * epoch).'.format(args.steps))
            print('-----------------')
        else:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} warm up steps / (gpu * epoch).'.format(args.warm_up))
            print('Have {0} measure steps / (gpu * epoch).'.format(step_per_epoch))
            print('-----------------')
    else:
        audio_preprocessor.featurizer.normalize = "per_feature"
        print("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)

    audio_preprocessor.eval()

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    model.eval()
    if args.ipex:
        ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)

    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1,
                                       model.module if multi_gpu else model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        encoderdecoder=model,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)

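# Side note (illustrative, not from this file): the shape traced for joint_net above,
# [batch, 1, 1, encoder_n_hidden + pred_n_hidden], reflects that the RNN-T joint
# network consumes the concatenation of one encoder frame and one prediction-network
# output per (time, label) position. The hidden sizes below are example values only.
import torch

enc_n_hidden, pred_n_hidden, batch_size = 1024, 320, 1   # example values only
f = torch.randn(batch_size, 1, 1, enc_n_hidden)          # one encoder time step
g = torch.randn(batch_size, 1, 1, pred_n_hidden)         # one prediction step
joint_input = torch.cat([f, g], dim=-1)                  # -> [B, 1, 1, enc + pred]
assert joint_input.shape[-1] == enc_n_hidden + pred_n_hidden
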
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    assert torch.cuda.is_available()

    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    multi_gpu = args.local_rank is not None
    if multi_gpu:
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    if args.fp16:
        optim_level = Optimization.mxprO3
    else:
        optim_level = Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"

    print('model_config')
    print_dict(model_definition)
    print('feature_config')
    print_dict(featurizer_config)

    data_layer = None
    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == "max",
            shuffle=False,
            multi_gpu=multi_gpu)

    audio_preprocessor = AudioPreprocessing(**featurizer_config)

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.wav is None:
        N = len(data_layer)
        step_per_epoch = math.ceil(N / (args.batch_size * (
            1 if not torch.distributed.is_initialized()
            else torch.distributed.get_world_size())))

        if args.steps is not None:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(
                args.steps * args.batch_size * (
                    1 if not torch.distributed.is_initialized()
                    else torch.distributed.get_world_size())))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
            print('-----------------')
        else:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
            print('-----------------')
    else:
        audio_preprocessor.featurizer.normalize = "per_feature"
        print("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)

    audio_preprocessor.cuda()
    audio_preprocessor.eval()

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cuda() for x in xs],
        lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    model.cuda()
    if args.fp16:
        model = amp.initialize(
            models=model,
            opt_level=AmpOptimizations[optim_level])
    model = model_multi_gpu(model, multi_gpu)

    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1,
                                       model.module if multi_gpu else model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        encoderdecoder=model,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)

def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    args.local_rank = os.environ.get('LOCAL_RANK', args.local_rank)

    # set up distributed training
    cpu_distributed_training = False
    if torch.distributed.is_available() and int(os.environ.get('PMI_SIZE', '0')) > 1:
        print('Distributed training with DDP')
        os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
        os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
        if 'MASTER_ADDR' not in os.environ:
            os.environ['MASTER_ADDR'] = args.master_addr
        if 'MASTER_PORT' not in os.environ:
            os.environ['MASTER_PORT'] = args.port

        # Initialize the process group with ccl backend
        if args.backend == 'ccl':
            import torch_ccl
        dist.init_process_group(backend=args.backend)
        cpu_distributed_training = True

    if torch.distributed.is_initialized():
        print("Torch distributed is initialized.")
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        print("Torch distributed is not initialized.")
        args.rank = 0
        args.world_size = 1

    multi_gpu = False
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    optim_level = Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    tst_manifest = args.tst_manifest
    featurizer_config = model_definition['input']
    featurizer_config_eval = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = model_definition.get('perturb', None)

    if args.pad_to_max:
        assert args.max_duration > 0
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"

    print_once('model_config')
    print_dict(model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(
            args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError('gradient accumulation step {} is not divisible by batch size {}'.format(
            args.gradient_accumulation_steps, args.batch_size))

    preprocessor = preprocessing.AudioPreprocessing(**featurizer_config)
    if args.cuda:
        preprocessor.cuda()
    else:
        preprocessor.cpu()

    augmentations = preprocessing.SpectrogramAugmentation(**featurizer_config)
    if args.cuda:
        augmentations.cuda()
    else:
        augmentations.cpu()

    train_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [augmentations(xs[0]), *xs[1:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type,
        cpu_distributed_training=cpu_distributed_training)

    eval_datasets = [(
        AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config_eval,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.eval_batch_size,
            multi_gpu=multi_gpu,
            pad_to_max=args.pad_to_max
        ),
        args.eval_frequency,
        'Eval clean',
    )]

    if tst_manifest:
        eval_datasets.append((
            AudioToTextDataLayer(
                dataset_dir=args.dataset_dir,
                featurizer_config=featurizer_config_eval,
                manifest_filepath=tst_manifest,
                labels=dataset_vocab,
                batch_size=args.eval_batch_size,
                multi_gpu=multi_gpu,
                pad_to_max=args.pad_to_max
            ),
            args.test_frequency,
            'Test other',
        ))

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    loss_fn = RNNTLoss(blank=len(ctc_vocab) - 1)

    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(N / (args.batch_size * (
            1 if not torch.distributed.is_initialized()
            else torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size)

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    # learning-rate policy: constant base lr, optionally wrapped by decay and warmup
    constant_lr_policy = lambda _: args.lr
    fn_lr_policy = constant_lr_policy
    if args.lr_decay:
        pre_decay_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_decay(args.num_epochs * args.step_per_epoch, s, pre_decay_policy(s))
    if args.lr_warmup:
        pre_warmup_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_warmup(args.lr_warmup, s, pre_warmup_policy(s))

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))

    if args.cuda and optim_level in AmpOptimizations:
        assert False, "not supported in ipex"

    if args.ckpt is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    if args.ipex:
        if args.bf16:
            model, optimizer = ipex.optimize(model, dtype=torch.bfloat16, optimizer=optimizer)
        else:
            model, optimizer = ipex.optimize(model, dtype=torch.float32, optimizer=optimizer)
        ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)

    if args.world_size > 1:
        device_ids = None
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=device_ids)

    print_once(model)
    print_once("# parameters: {}".format(sum(p.numel() for p in model.parameters())))
    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model)

    if args.tb_path and args.local_rank == 0:
        logger = TensorBoardLogger(args.tb_path, model.module if multi_gpu else model, args.histogram)
    else:
        logger = DummyLogger()

    train(
        data_layer=data_layer,
        model=model,
        loss_fn=loss_fn,
        greedy_decoder=greedy_decoder,
        optimizer=optimizer,
        data_transforms=train_transforms,
        labels=ctc_vocab,
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy,
        evalutaion=evaluator(model, eval_transforms, loss_fn, greedy_decoder,
                             ctc_vocab, eval_datasets, logger),
        logger=logger,
        args=args)

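# Illustration only: how the fn_lr_policy chain built above composes per step `s`.
# lr_decay and lr_warmup are this repo's helpers; their exact formulas are not shown
# in this file, so the stand-ins below are hypothetical (polynomial decay, linear
# warmup) purely to make the composition concrete.
def _example_lr_decay(total_steps, step, base_lr, power=2.0):
    # assumed form: polynomial decay toward zero over the whole schedule
    return base_lr * max(0.0, 1.0 - step / total_steps) ** power

def _example_lr_warmup(warmup_steps, step, target_lr):
    # assumed form: linear ramp from zero up to the post-decay learning rate
    return target_lr * min(1.0, (step + 1) / warmup_steps)

# With both flags enabled, the chain evaluates as:
#   fn_lr_policy(s) == lr_warmup(args.lr_warmup, s,
#                         lr_decay(args.num_epochs * args.step_per_epoch, s, args.lr))
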
def __init__(self, config_toml, checkpoint_path, dataset_dir,
             manifest_filepath, perf_count, total_query_count,
             scenario, machine_conf, batch_size=1,
             cores_for_loadgen=0, cores_per_instance=1,
             enable_debug=False, cosim=False, profile=False,
             ipex=False, bf16=False, warmup=False):
    ### multi instance attributes
    self.batch_size = batch_size
    self.cores_for_loadgen = cores_for_loadgen
    self.cores_per_instance = cores_per_instance
    self.num_cores = get_num_cores()
    self.lock = mp.Lock()
    self.init_counter = mp.Value("i", 0)
    self.output_queue = mp.Queue()
    self.input_queue = mp.JoinableQueue()
    self.cosim = cosim
    self.ipex = ipex
    self.bf16 = bf16
    self.warmup = warmup
    self.scenario = scenario

    # server-specific
    self.num_queues = None
    self.core_count_list = []
    self.num_instance_list = []
    self.seq_cutoff_list = []
    self.batch_size_list = []
    self.input_queue_list = []
    self.total_query_count = total_query_count

    if self.scenario == "Server":
        # read config
        self.read_machine_conf(machine_conf)
        # create queue list
        for _ in range(self.num_queues):
            self.input_queue_list.append(mp.JoinableQueue())

    config = toml.load(config_toml)
    dataset_vocab = config['labels']['labels']
    rnnt_vocab = add_blank_label(dataset_vocab)
    featurizer_config = config['input_eval']

    self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries,
                               self.process_latencies)
    self.qsl = AudioQSLInMemory(dataset_dir,
                                manifest_filepath,
                                dataset_vocab,
                                featurizer_config["sample_rate"],
                                perf_count)

    if self.scenario == "Offline":
        self.issue_queue = InQueue(self.input_queue, batch_size)
    elif self.scenario == "Server":
        self.issue_queue = InQueueServer(self.input_queue_list,
                                         self.qsl,
                                         self.seq_cutoff_list,
                                         self.batch_size_list,
                                         self.total_query_count)

    ### worker processes
    self.consumers = []
    cur_core_idx = self.cores_for_loadgen
    rank = 0
    if self.scenario == "Offline":
        while cur_core_idx + self.cores_per_instance <= self.num_cores:
            self.consumers.append(
                Consumer(self.input_queue, self.output_queue,
                         self.lock, self.init_counter, rank,
                         cur_core_idx, cur_core_idx + self.cores_per_instance - 1,
                         self.num_cores, self.qsl, config_toml, checkpoint_path,
                         dataset_dir, manifest_filepath, perf_count,
                         cosim, profile, ipex, bf16, warmup))
            rank += 1
            cur_core_idx += self.cores_per_instance
    elif self.scenario == "Server":
        for i in range(self.num_queues):
            curr_cores_per_instance = self.core_count_list[i]
            for _ in range(self.num_instance_list[i]):
                self.consumers.append(
                    Consumer(self.input_queue_list[i], self.output_queue,
                             self.lock, self.init_counter, rank,
                             cur_core_idx, cur_core_idx + curr_cores_per_instance - 1,
                             self.num_cores, self.qsl, config_toml, checkpoint_path,
                             dataset_dir, manifest_filepath, perf_count,
                             cosim, profile, ipex, bf16, warmup))
                rank += 1
                cur_core_idx += curr_cores_per_instance
    self.num_instances = len(self.consumers)

    ### start worker processes
    for c in self.consumers:
        c.start()

    ### wait until all sub-processes are ready
    block_until(self.init_counter, self.num_instances, 2)

    ### start response thread
    self.response_worker = threading.Thread(
        target=response_loadgen, args=(self.output_queue,))
    self.response_worker.daemon = True
    self.response_worker.start()

    ### debug
    global debug
    debug = enable_debug

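# Hypothetical sketch of the block_until(counter, target, interval) helper used above;
# the real implementation lives elsewhere in this repo and may differ. It simply polls
# the shared counter until every worker process has checked in.
import time

def block_until_sketch(counter, target_count, poll_interval_s):
    """Block until `counter` (a multiprocessing.Value) reaches `target_count`."""
    while counter.value < target_count:
        time.sleep(poll_interval_s)
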
def run(self):
    core_list = range(self.start_core, self.end_core + 1)
    num_cores = len(core_list)

    # pin this worker process to its slice of cores and size the thread pools to match
    os.sched_setaffinity(self.pid, core_list)
    cmd = "taskset -p -c %d-%d %d" % (self.start_core, self.end_core, self.pid)
    print(cmd)
    os.system(cmd)
    os.environ['OMP_NUM_THREADS'] = '{}'.format(self.end_core - self.start_core + 1)
    print("### set rank {} to cores [{}:{}]; omp num threads = {}".format(
        self.rank, self.start_core, self.end_core, num_cores))
    torch.set_num_threads(num_cores)

    if not self.model_init:
        print("lazy_init rank {}".format(self.rank))
        config = toml.load(self.config_toml)
        dataset_vocab = config['labels']['labels']
        rnnt_vocab = add_blank_label(dataset_vocab)
        featurizer_config = config['input_eval']

        self.audio_preprocessor = AudioPreprocessing(**featurizer_config)
        self.audio_preprocessor.eval()
        self.audio_preprocessor = torch.jit.script(self.audio_preprocessor)
        self.audio_preprocessor = torch.jit._recursive.wrap_cpp_module(
            torch._C._freeze_module(self.audio_preprocessor._c))

        model = RNNT(feature_config=featurizer_config,
                     rnnt=config['rnnt'],
                     num_classes=len(rnnt_vocab))

        # migrate checkpoint keys to the current module layout before loading
        checkpoint = torch.load(self.checkpoint_path, map_location="cpu")
        migrated_state_dict = {}
        for key, value in checkpoint['state_dict'].items():
            key = key.replace("joint_net", "joint.net")
            migrated_state_dict[key] = value
        del migrated_state_dict["audio_preprocessor.featurizer.fb"]
        del migrated_state_dict["audio_preprocessor.featurizer.window"]
        model.load_state_dict(migrated_state_dict, strict=True)

        if self.ipex:
            import intel_pytorch_extension as ipex
            if self.bf16:
                ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)
            ipex.core.enable_auto_dnnl()
            model = model.to(ipex.DEVICE)

        model.eval()
        if not self.ipex:
            model.encoder = torch.jit.script(model.encoder)
            model.encoder = torch.jit._recursive.wrap_cpp_module(
                torch._C._freeze_module(model.encoder._c))
            model.prediction = torch.jit.script(model.prediction)
            model.prediction = torch.jit._recursive.wrap_cpp_module(
                torch._C._freeze_module(model.prediction._c))
            model.joint = torch.jit.script(model.joint)
            model.joint = torch.jit._recursive.wrap_cpp_module(
                torch._C._freeze_module(model.joint._c))
            model = torch.jit.script(model)

        self.greedy_decoder = ScriptGreedyDecoder(len(rnnt_vocab) - 1, model)
        self.model_init = True

    if self.warmup:
        self.do_warmup()

    self.lock.acquire()
    self.init_counter.value += 1
    self.lock.release()

    if self.rank == 0 and self.cosim:
        print('Running with cosim mode, performance will be slow!!!')

    if self.rank == 0 and self.profile:
        print('Start profiler')
        with profiler.profile(record_shapes=True) as prof:
            self.run_queue(debug=True)
        print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=20))
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
        print(prof.key_averages(group_by_input_shape=True).table(
            sort_by="self_cpu_time_total", row_limit=40))
        print(prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=40))
        while self.run_queue():
            pass
    else:
        while self.run_queue():
            pass