def main():
    parser = argparse.ArgumentParser(
        description='Find and test local project files.')
    parser.add_argument('--parser', choices=['normal', 'fast', 'classic'],
                        default='normal')
    parser.add_argument('-f', '--find', metavar='PATH',
                        help='find local project files')
    parser.add_argument('-t', '--test', action='store_true',
                        help='run all tests')
    parser.add_argument('-s', '--start-index', action='store', type=int,
                        dest='start_index', default=0)
    parser.add_argument('-n', '--max-files', action='store', type=int,
                        dest='max_files',
                        help='maximum number of files to process')
    parser.add_argument('-d', '--disable-parallel', action='store_true',
                        help='do not run tests in parallel')
    parser.add_argument('--diff', choices=['unified', 'html', 'opendiff'],
                        default='opendiff', help='how to display the diffs')
    parser.add_argument('--reportstats', action='store_true',
                        help='print performance statistics')
    parser.add_argument('--profile', action='store_true',
                        help='run everything through the profiler')
    args = parser.parse_args()

    # Require exactly one of the mutually exclusive actions.
    num_actions = 0
    actions = 'find test'.split()
    for act in actions:
        if getattr(args, act):
            num_actions += 1
    if num_actions != 1:
        parser.error('Please specify exactly one of the options %s.'
                     % ', '.join('--' + x for x in actions))

    if args.profile:
        print('Profiling...')
        utils.profile('call_command(args, parser)', locals(), globals())
    else:
        call_command(args, parser)
def main():
    parser = argparse.ArgumentParser(
        description='Show some histograms for a directory of Xcode project files.')
    parser.add_argument('-u', '--utcoffset', type=int, default=-8,
                        metavar='UTCOFFSET',
                        help='UTC time offset, e.g. "-8" for California')
    parser.add_argument('--startyear', type=int, default=2006)
    parser.add_argument('--endyear', type=int, default=2014)
    parser.add_argument('-n', '--max-files', action='store', type=int,
                        default=None,
                        help='maximum number of files to process')
    parser.add_argument('--max-firstnames', action='store', type=int,
                        default=None,
                        help='maximum number of first names to consider')
    parser.add_argument('--emoji', action='store_true',
                        help='add emoji characters to userhashes')
    parser.add_argument('--emojitable', action='store_true',
                        help='only print the emoji table')
    parser.add_argument('--profile', action='store_true',
                        help='run everything through the profiler')
    parser.add_argument('directory', help='directory with Xcode project files')
    args = parser.parse_args()

    if args.profile:
        write('Profiling...')
        utils.profile('call_command(args, parser)', locals(), globals())
    else:
        call_command(args)
def main():
    from utils import (init_torch_seeds, model_info, profile,
                       profile_training)

    init_torch_seeds(seed=1234)

    # analyze backbone characteristics of different models
    model_builders = [
        models.resnet18,
        models.resnet50,
        models.vgg16,
        models.shufflenet_v2_x2_0,
        models.mobilenet_v2,
        Yolov5,
        ghostnet,
    ][-2:]

    for model_builder in model_builders:
        print(f'{10*"-"} {model_builder.__name__} {10*"-"}')
        model = get_backbone(model_builder, pretrained=False)
        model_info(model, verbose=False, img_size=512)
        profile(model, verbose=True, amp=True)
        profile_training(model, amp=True)

    '''
    for Base in [nn.Conv2d, DeformConv, SpatiallyConv, DepthwiseConv,
                 FlattenedConv, GroupedConv, ShuffledGroupedConv]:
        # change 'BASE' class for 'Conv' wrapper class
        convs.BASE = Base
        if 'group' in Base.__name__.lower():
            convs.GROUPS = 8
        else:
            convs.GROUPS = 1
        print(f'BASE: {convs.BASE.__name__}, GROUPS: {convs.GROUPS}')

        model = centernet(heads={'cpt_hm': 30, 'cpt_off': 2, 'wh': 2})
        model.info()  # summary
        try:
            profile(model)  # timing
            model.fuse()  # fuse and print summary again
            profile(model)  # fuse timing
            profile_training(model)  # forward + backward timing/memory
        except Exception as e:
            print(e)

    """
    PyTorch version 1.6.0
    CUDA version 10.2
    cuDNN version 7605
    cuDNN deterministic False
    cuDNN benchmark True

    BASE: Conv2d, GROUPS: 1
    Model Summary: 260 layers, 17.9M parameters, 17.9M gradients, 62.1 GFLOPs
    Input size: torch.Size([1, 3, 512, 512])
def train(data_train, model, nsp_loss, mlm_loss, vocab_size, ctx, store):
    """Training function."""
    mlm_metric = nlp.metric.MaskedAccuracy()
    nsp_metric = nlp.metric.MaskedAccuracy()
    mlm_metric.reset()
    nsp_metric.reset()

    lr = args.lr
    optim_params = {'learning_rate': lr, 'epsilon': 1e-6, 'wd': 0.01}
    if args.dtype == 'float16':
        optim_params['multi_precision'] = True

    trainer = mx.gluon.Trainer(model.collect_params(), 'bertadam', optim_params,
                               update_on_kvstore=False, kvstore=store)
    dynamic_loss_scale = args.dtype == 'float16'
    fp16_trainer = FP16Trainer(trainer, dynamic_loss_scale=dynamic_loss_scale)

    if args.start_step:
        state_path = os.path.join(args.ckpt_dir,
                                  '%07d.states.%02d' % (args.start_step, 0))
        logging.info('Loading trainer state from %s', state_path)
        nlp.utils.load_states(trainer, state_path)

    accumulate = args.accumulate
    num_train_steps = args.num_steps
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    params = [
        p for p in model.collect_params().values() if p.grad_req != 'null'
    ]
    param_dict = model.collect_params()

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0

    if accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    train_begin_time = time.time()
    begin_time = time.time()
    running_mlm_loss = running_nsp_loss = running_num_tks = 0
    batch_num = 0
    step_num = args.start_step

    parallel_model = ParallelBERT(model, mlm_loss, nsp_loss, vocab_size,
                                  store.num_workers * accumulate,
                                  trainer=fp16_trainer)
    num_ctxes = len(ctx)
    parallel = nlp.utils.Parallel(num_ctxes if num_ctxes > 1 else 0,
                                  parallel_model)

    while step_num < num_train_steps:
        for _, dataloader in enumerate(data_train):
            if step_num >= num_train_steps:
                break

            # create dummy data loader if needed
            if args.dummy_data_len:
                target_shape = (args.batch_size * num_ctxes,
                                args.dummy_data_len)
                dataloader = get_dummy_dataloader(dataloader, target_shape)

            for _, data_batch in enumerate(dataloader):
                if step_num >= num_train_steps:
                    break
                if batch_num % accumulate == 0:
                    step_num += 1
                    # if accumulate > 1, grad_req is set to 'add', and zero_grad is required
                    if accumulate > 1:
                        param_dict.zero_grad()
                    # update learning rate
                    if step_num <= num_warmup_steps:
                        new_lr = lr * step_num / num_warmup_steps
                    else:
                        offset = lr * step_num / num_train_steps
                        new_lr = lr - offset
                    trainer.set_learning_rate(new_lr)
                    if args.profile:
                        profile(step_num, 10, 12, profile_name=args.profile)

                if args.use_avg_len:
                    data_list = [[seq.as_in_context(context) for seq in shard]
                                 for context, shard in zip(ctx, data_batch)]
                else:
                    if data_batch[0].shape[0] < len(ctx):
                        continue
                    data_list = split_and_load(data_batch, ctx)

                ns_label_list, ns_pred_list = [], []
                mask_label_list, mask_pred_list, mask_weight_list = [], [], []

                # parallel forward / backward
                for data in data_list:
                    parallel.put(data)
                for _ in range(len(ctx)):
                    (_, next_sentence_label, classified, masked_id, decoded,
                     masked_weight, ls1, ls2, valid_length) = parallel.get()
                    ns_label_list.append(next_sentence_label)
                    ns_pred_list.append(classified)
                    mask_label_list.append(masked_id)
                    mask_pred_list.append(decoded)
                    mask_weight_list.append(masked_weight)
                    running_mlm_loss += ls1.as_in_context(mx.cpu()) / num_ctxes
                    running_nsp_loss += ls2.as_in_context(mx.cpu()) / num_ctxes
                    running_num_tks += valid_length.sum().as_in_context(mx.cpu())

                # update
                if (batch_num + 1) % accumulate == 0:
                    fp16_trainer.step(1, max_norm=1)
                nsp_metric.update(ns_label_list, ns_pred_list)
                mlm_metric.update(mask_label_list, mask_pred_list,
                                  mask_weight_list)

                # logging
                if (step_num + 1) % (args.log_interval) == 0 and \
                        (batch_num + 1) % accumulate == 0:
                    log(begin_time, running_num_tks,
                        running_mlm_loss / accumulate,
                        running_nsp_loss / accumulate, step_num,
                        mlm_metric, nsp_metric, trainer, args.log_interval)
                    begin_time = time.time()
                    running_mlm_loss = running_nsp_loss = running_num_tks = 0
                    mlm_metric.reset_local()
                    nsp_metric.reset_local()

                # saving checkpoints
                if (step_num + 1) % args.ckpt_interval == 0 \
                        and (batch_num + 1) % accumulate == 0 and store.rank == 0:
                    save_states(step_num, trainer, args.ckpt_dir)
                    save_parameters(step_num, model, args.ckpt_dir)
                batch_num += 1

    if store.rank == 0:
        save_states(step_num, trainer, args.ckpt_dir)
        save_parameters(step_num, model, args.ckpt_dir)
    mx.nd.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))
def profile_internal(e, o):
    out, result = profile(app)(e, o)
    return list(out) + ["<pre>" + net.websafe(result) + "</pre>"]
def compute_solution(self, state, prev_move=None, callback=None):
    if state == RubiksCube.SOLVED_STR:
        solution = []
    elif prev_move is not None and len(
            self._solution) > 1 and self._solution[0] == prev_move:
        solution = self._solution[1:]
    else:
        solution = solve(state)
    self._solution = solution
    if callback is not None:
        callback(solution)


if __name__ == '__main__':
    c = RubiksCube()
    with profile(True):
        for i in range(10):
            c.shuffle()
            # c.pprint()
            try:
                moves = solve(c.state_string)
                # for m in moves.split(" "):
                #     c.move(m)
                # c.pprint()
                # print("Number of moves", len(moves))
            except (Exception, KeyboardInterrupt) as e:
                print(e, f"Cube state: {c.state_string}")
                c.pprint()
                raise e
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    # if args.gpu is not None:
    #     print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.debug:
        for k, v in model.named_modules():
            print(k)
        return

    if args.show:
        input_data = torch.randn([1, 3, 224, 224])
        summary(model.cuda(), (3, 224, 224))
        model = model.cpu()
        with SummaryWriter(log_dir='./log', comment='resnet18') as w:
            w.add_graph(model, (input_data))
        return

    if args.flops:
        input_data = torch.randn([1, 3, 224, 224])
        flops, params = profile(model, inputs=(input_data, ))
        print(flops)
        print("flops: {}, params: {}".format(clever_format(flops), params))
        return

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model,
                                                              device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume_normal:
        if os.path.isfile(args.resume_normal):
            print("=> loading checkpoint '{}'".format(args.resume_normal))
            checkpoint = torch.load(args.resume_normal)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume_normal, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume_normal))
    elif args.resume_from:
        # increase channel_removed_ratio as FBS
        if os.path.isfile(args.resume_from):
            if not args.lasso:
                print("=> loading pretrained model '{}'".format(args.resume_from))
                print("=> increase channel removed ratio to '{}'".format(
                    args.channel_removed_ratio))
                checkpoint = torch.load(args.resume_from)
                args.start_epoch = 0
                model.load_state_dict(checkpoint['state_dict'])
                print("=> loaded pretrained model '{}' (epoch {})".format(
                    args.resume_from, args.start_epoch))
            elif args.lasso:
                print("=> loading pretrained model '{}'".format(args.resume_from))
                print("=> increase channel removed ratio to '{}'".format(
                    args.channel_removed_ratio))
                checkpoint = torch.load(args.resume_from)
                args.start_epoch = 0
                oldmodel = checkpoint['state_dict']
                # for k, v in oldmodel.items():
                #     print(k)
                for key, value in model.state_dict().items():
                    if "channel_l1" in key:
                        continue
                    if "spatial_l1" in key:
                        continue
                    value.copy_(oldmodel[key])
                print("=> loaded pretrained model '{}' (epoch {})".format(
                    args.resume_from, args.start_epoch))
        # return

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)

    log.close()
def train(data_train, data_eval, model, nsp_loss, mlm_loss, vocab_size, ctx):
    """Training function."""
    hvd.broadcast_parameters(model.collect_params(), root_rank=0)

    mlm_metric = nlp.metric.MaskedAccuracy()
    nsp_metric = nlp.metric.MaskedAccuracy()
    mlm_metric.reset()
    nsp_metric.reset()

    logging.debug('Creating distributed trainer...')
    lr = args.lr
    optim_params = {'learning_rate': lr, 'epsilon': 1e-6, 'wd': 0.01}
    if args.dtype == 'float16':
        optim_params['multi_precision'] = True

    dynamic_loss_scale = args.dtype == 'float16'
    if dynamic_loss_scale:
        loss_scale_param = {'scale_window': 2000 / num_workers}
    else:
        loss_scale_param = None
    trainer = hvd.DistributedTrainer(model.collect_params(), 'bertadam', optim_params)
    fp16_trainer = FP16Trainer(trainer, dynamic_loss_scale=dynamic_loss_scale,
                               loss_scaler_params=loss_scale_param)

    if args.start_step:
        state_path = os.path.join(args.ckpt_dir,
                                  '%07d.states.%02d' % (args.start_step, local_rank))
        logging.info('Loading trainer state from %s', state_path)
        nlp.utils.load_states(trainer, state_path)

    accumulate = args.accumulate
    num_train_steps = args.num_steps
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    params = [p for p in model.collect_params().values() if p.grad_req != 'null']
    param_dict = model.collect_params()

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    if accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    train_begin_time = time.time()
    begin_time = time.time()
    running_mlm_loss, running_nsp_loss = 0, 0
    running_num_tks = 0
    batch_num = 0
    step_num = args.start_step

    logging.debug('Training started')
    while step_num < num_train_steps:
        for _, dataloader in enumerate(data_train):
            if step_num >= num_train_steps:
                break

            # create dummy data loader if needed
            if args.dummy_data_len:
                target_shape = (args.batch_size, args.dummy_data_len)
                dataloader = get_dummy_dataloader(dataloader, target_shape)

            for _, data_batch in enumerate(dataloader):
                if step_num >= num_train_steps:
                    break
                if batch_num % accumulate == 0:
                    step_num += 1
                    # if accumulate > 1, grad_req is set to 'add', and zero_grad is required
                    if accumulate > 1:
                        param_dict.zero_grad()
                    # update learning rate
                    if step_num <= num_warmup_steps:
                        new_lr = lr * step_num / num_warmup_steps
                    else:
                        offset = lr * step_num / num_train_steps
                        new_lr = lr - offset
                    trainer.set_learning_rate(new_lr)
                    if args.profile:
                        profile(step_num, 10, 14,
                                profile_name=args.profile + str(rank))

                # load data
                if args.use_avg_len:
                    data_list = [[seq.as_in_context(context) for seq in shard]
                                 for context, shard in zip([ctx], data_batch)]
                else:
                    data_list = list(split_and_load(data_batch, [ctx]))
                data = data_list[0]

                # forward
                with mx.autograd.record():
                    (ls, ns_label, classified, masked_id, decoded,
                     masked_weight, ls1, ls2, valid_len) = forward(
                         data, model, mlm_loss, nsp_loss, vocab_size, args.dtype)
                    ls = ls / accumulate
                # backward
                if args.dtype == 'float16':
                    fp16_trainer.backward(ls)
                else:
                    ls.backward()

                running_mlm_loss += ls1.as_in_context(mx.cpu())
                running_nsp_loss += ls2.as_in_context(mx.cpu())
                running_num_tks += valid_len.sum().as_in_context(mx.cpu())

                # update
                if (batch_num + 1) % accumulate == 0:
                    # step() performs 3 things:
                    # 1. allreduce gradients from all workers
                    # 2. checking the global_norm of gradients and clip them if necessary
                    # 3. averaging the gradients and apply updates
                    fp16_trainer.step(1, max_norm=1 * num_workers)

                nsp_metric.update([ns_label], [classified])
                mlm_metric.update([masked_id], [decoded], [masked_weight])

                # logging
                if (step_num + 1) % (args.log_interval) == 0 and \
                        (batch_num + 1) % accumulate == 0:
                    log(begin_time, running_num_tks,
                        running_mlm_loss / accumulate,
                        running_nsp_loss / accumulate, step_num,
                        mlm_metric, nsp_metric, trainer, args.log_interval)
                    begin_time = time.time()
                    running_mlm_loss = running_nsp_loss = running_num_tks = 0
                    mlm_metric.reset_local()
                    nsp_metric.reset_local()

                # saving checkpoints
                if (step_num + 1) % args.ckpt_interval == 0 and \
                        (batch_num + 1) % accumulate == 0:
                    if is_master_node:
                        save_states(step_num, trainer, args.ckpt_dir, local_rank)
                        if local_rank == 0:
                            save_parameters(step_num, model, args.ckpt_dir)
                    if data_eval:
                        # eval data is always based on a fixed npz file.
                        dataset_eval = get_pretrain_data_npz(
                            data_eval, args.batch_size_eval, 1, False, False, 1)
                        evaluate(dataset_eval, model, nsp_loss, mlm_loss,
                                 len(vocab), [ctx], args.log_interval, args.dtype)
                batch_num += 1

    if is_master_node:
        save_states(step_num, trainer, args.ckpt_dir, local_rank)
        if local_rank == 0:
            save_parameters(step_num, model, args.ckpt_dir)
    mx.nd.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))
def issue_queries(self, query_samples):
    def run_one_batch(cur_batch_size=1, base_index=0):
        inputs_list = []
        token_types_list = []
        valid_length_list = []
        for i in range(cur_batch_size):
            idx = base_index + i
            eval_features = self.qsl.get_features(query_samples[idx].index)
            example_ids, inputs, token_types, valid_length, _, _ = eval_features
            inputs_list.append(inputs)
            token_types_list.append(token_types)
            valid_length_list.append(valid_length)

        # pad every sequence in the batch to the same length
        max_len = max([len(inp) for inp in inputs_list])
        for i in range(len(inputs_list)):
            inputs_list[i] += [0] * (max_len - len(inputs_list[i]))
            token_types_list[i] += [0] * (max_len - len(token_types_list[i]))

        inputs = mx.nd.array(inputs_list).as_in_context(self.ctx)
        token_types = mx.nd.array(token_types_list).as_in_context(self.ctx)
        valid_length = mx.nd.array(valid_length_list).as_in_context(
            self.ctx).astype('float32')

        ## run with a batch
        out = self.net(inputs, token_types, valid_length)
        out_np = out.asnumpy()
        out_list = np.split(out_np, cur_batch_size, axis=0)
        for i, o in enumerate(out_list):
            idx = base_index + i
            response_array = array.array(
                "B", np.array(o).astype(np.float32).tobytes())
            bi = response_array.buffer_info()
            response = lg.QuerySampleResponse(query_samples[idx].id, bi[0], bi[1])
            lg.QuerySamplesComplete([response])

    num_samples = len(query_samples)
    if num_samples == 1:
        eval_features = self.qsl.get_features(query_samples[0].index)
        example_ids, inputs, token_types, valid_length, _, _ = eval_features
        inputs = mx.nd.array(inputs).reshape(1, -1)
        token_types = mx.nd.array(token_types).reshape(1, -1)
        valid_length = mx.nd.array(valid_length).reshape(-1, )
        out = self.net(
            inputs.as_in_context(self.ctx),
            token_types.as_in_context(self.ctx),
            valid_length.as_in_context(self.ctx).astype('float32'))
        out = out.asnumpy()
        response_array = array.array(
            "B", np.array(out).astype(np.float32).tobytes())
        bi = response_array.buffer_info()
        response = lg.QuerySampleResponse(query_samples[0].id, bi[0], bi[1])
        lg.QuerySamplesComplete([response])
    else:
        ## TODO, used in batch_size tuning
        if num_samples < self.batch_size:
            if self.logger:
                self.logger.error(
                    'batch_size {0} is larger than provided samples {1}, consider'
                    ' to decrease batch_size.'.format(self.batch_size, num_samples))
            sys.exit(-1)
        num_batch = num_samples // self.batch_size
        remaining_batch = num_samples % self.batch_size
        if self.logger:
            self.logger.info(
                'split the datasets into {0} batches with bs={1} and remaining {2}...'
                .format(num_batch, self.batch_size, remaining_batch))

        start_step = 10
        end_step = 30 if num_batch > 30 else num_batch
        for b in range(num_batch):
            base_index = b * self.batch_size
            profile(b, start_step, end_step, profile_name='profile.json',
                    early_exit=False)
            run_one_batch(self.batch_size, base_index)

        if remaining_batch > 0:
            base_index = num_batch * self.batch_size
            run_one_batch(remaining_batch, base_index)
                                       transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                         shuffle=False, num_workers=2)
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# Model
print('==> Building model..')
get_model = model_dict[args.net]
net = get_model()
print('==> Model:', args.net)

flops, params = profile(net, inputs=(torch.randn(1, 3, 32, 32), ))
print('* MACs: {:,.2f}'.format(flops).replace('.00', ''))
print('* Params: {:,.2f}'.format(params).replace('.00', ''))

if torch.cuda.is_available():
    device = 'cuda'
    print('==> cuda is available (gpu)')
else:
    device = 'cpu'
    print('==> No cuda, running on cpu')

net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if args.resume:
    M = mass
    g = 9.8
    return jet.array([
        z[1],
        -1.0/(l+z[2])*(2*z[1]*z[3]+g*(m/2+M)/(m/3+M)*jet.sin(z[0])),
        z[3],
        (l+z[2])*z[1]**2+(m/2+M)/(m/3+M)*g*jet.cos(z[0])-1.0/(m/3+M)*k*z[2]
    ])


# Create time steps (num must be an integer for numpy.linspace)
time = numpy.linspace(0.0, 10.0, int(1e7))

# Specify initial conditions
init = numpy.array([jet.pi / 2, 0, mass * 9.8 / spring['k'], 0])  # initial values
# array([theta, theta_dot, x, x_dot])

profile_derive = lambda: profile(lambda: deriv(init, time[0]))
profile_odeint = lambda: profile(lambda: odeint(deriv, init, time))

print('jet_mode = True')
print('derivatives: %f, %f' % (profile_derive(), profile_derive()))
print('integration: %f' % profile_odeint())

jet.set_options(jet_mode=False)
print('---')
print('jet_mode = False')
print('derivatives: %f' % profile_derive())
print('integration: %f' % profile_odeint())
def profile_internal(e, o):
    out, result = profile(app)(e, o)
    return list(out) + ['<pre>' + net.websafe(result) + '</pre>']
    while not app.closed:
        app.draw_frame(cube)
        dt = clock.tick(30)
        app.event_hub.raise_event(
            Event(origin=Event.APPLICATION, type=Event.NEWFRAME, dt=dt))
        event_hub.handle_events()


def run_controls_ui():
    dash.mainloop()


if __name__ == '__main__':
    with profile(on=False):
        event_hub = EventsHub()
        cube = RubiksCubeDrawer(event_hub)
        cube.state.pprint()
        # cube.load_state(RubiksCube.SOLVED_STR)
        camera = Camera(event_hub)
        app = OpenGLApp(event_hub)
        dash = Dashboard(event_hub)

        tCube = Thread(target=run_cube_sim)
        tCube.start()
        run_controls_ui()  # Tkinter needs to be called from main thread
        tCube.join()
def run(self):
    global batching
    # os.sched_setaffinity(self.pid, self.affinity)
    cmd = "taskset -p -c %d-%d %d" % (self.start_core_idx, self.end_core_idx, self.pid)
    print(cmd)
    os.system(cmd)

    import mxnet as mx
    ctx = mx.cpu()
    # from numexpr.utils import set_num_threads
    # set_num_threads(28)
    os.environ['OMP_NUM_THREADS'] = '{}'.format(
        self.end_core_idx - self.start_core_idx + 1)

    model = BERTModel(mx.cpu(), self.args.vocab, self.args.params,
                      self.args.quantized, self.args.quantized_model_prefix)
    data_set = BERTDataSet(self.args.vocab, self.args.perf_count)

    self.lock.acquire()
    self.calibrate_counter.value += 1
    self.lock.release()
    block_until(self.calibrate_counter, self.world_size)
    if self.args.perf_calibrate:
        self.calibrate(model, data_set, ctx)
        return

    self.lock.acquire()
    self.calibrate_counter.value += 1
    self.lock.release()
    if self.args.warmup:
        self.warmup(model, data_set, ctx, self.args.scenario)

    self.lock.acquire()
    self.init_counter.value += 1
    self.lock.release()

    # affinity = os.sched_getaffinity(self.pid)
    # print('Process', self.pid, 'affinity proc list:', affinity)

    cur_step = 0
    start_step = 384
    end_step = -1
    from utils import profile

    while True:
        next_task = self.task_queue.get()  # (self.proc_idx)
        if next_task is None:
            # None means shutdown
            log.info('Exiting {}-pid:{}, cur_step={}'.format(
                self.name, self.pid, cur_step))
            self.task_queue.task_done()
            if self.args.profile and self.proc_idx == 0:
                if end_step == -1:
                    end_step = cur_step
                profile(cur_step, start_step, end_step,
                        profile_name='profile_{}.json'.format(self.pid),
                        early_exit=False)
            break

        query_id_list = next_task.query_id_list
        sample_index_list = next_task.sample_index_list
        batch_size = len(sample_index_list)
        # print('pid-{}, query_id_list: {}, sample_index_list: {}'.format(
        #     self.pid, query_id_list, sample_index_list))

        inputs_list = []
        token_types_list = []
        valid_length_list = []
        for sample_index in sample_index_list:
            eval_feature = data_set.eval_features[sample_index]
            _, inputs, token_types, valid_length, _, _ = eval_feature
            inputs_list.append(inputs)
            token_types_list.append(token_types)
            valid_length_list.append(valid_length)

        if len(inputs_list) > 1:
            max_len = max([len(inp) for inp in inputs_list])
            new_max_len, bs, best_throughput = get_best_bs(max_len)
            if bs == len(inputs_list):
                max_len = new_max_len
            # for i in range(len(inputs_list)):
            #     inputs_list[i] += [0] * (max_len - len(inputs_list[i]))
            #     token_types_list[i] += [0] * (max_len - len(token_types_list[i]))
        else:
            max_len = self.max_pad_len  # len(inputs_list[0]) #self.max_pad_len #len(inputs_list)

        for i in range(len(inputs_list)):
            inputs_list[i] += [0] * (max_len - len(inputs_list[i]))
            token_types_list[i] += [0] * (max_len - len(token_types_list[i]))

        inputs = mx.nd.array(inputs_list).as_in_context(ctx)
        token_types = mx.nd.array(token_types_list).as_in_context(ctx)
        valid_length = mx.nd.array(valid_length_list).as_in_context(
            ctx).astype('float32')

        if self.args.profile and self.proc_idx == 0:
            profile(cur_step, start_step, end_step,
                    profile_name='profile_{}.json'.format(self.pid),
                    early_exit=False)
        cur_step += 1

        # t0 = time.time()
        out = model.net(inputs, token_types, valid_length)
        out_np = out.asnumpy()
        # t1 = time.time()
        # if self.proc_idx == 0:
        #     cur_throughput = len(inputs_list) / (t1 - t0)
        #     if best_throughput != 0:
        #         throughput_diff = (cur_throughput - best_throughput) / best_throughput
        #         print('inference seq len = {} BS = {} throughput = {:.5f} ({:.3f}%)'.format(
        #             max_len, len(inputs_list), cur_throughput, throughput_diff * 100))
        #     else:
        #         print('inference seq len = {} BS = {} throughput = {:.5f}'.format(
        #             max_len, len(inputs_list), cur_throughput))

        result = Output(query_id_list, out_np)
        self.result_queue.put(result)
        # print('consumer-{}: output.shape={}, query_id={}'.format(
        #     self.pid, out_np.shape, query_id_list[0]))
        self.task_queue.task_done()
import FaultyMemory as FyM
import torch
import torchvision.models as models

from utils import profile

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")

resnet18 = models.resnet18(pretrained=True).to(device)
dummy_tensor = torch.randn([32, 3, 32, 32]).to(device)
representation = FyM.SlowFixedPointRepresentation()


def inference_parameters():
    handler = FyM.Handler(resnet18)
    handler.add_net_parameters(representation)
    handler(dummy_tensor)


_ = profile(inference_parameters, __file__, device)