import argparse

# `utils` (providing utils.profile) and `call_command` come from the surrounding project.
def main():
    parser = argparse.ArgumentParser(description='Find and test local project files.')
    parser.add_argument('--parser', choices=['normal', 'fast', 'classic'], default='normal')
    parser.add_argument('-f', '--find', metavar='PATH', help='find local project files')
    parser.add_argument('-t', '--test', action='store_true', help='run all tests')
    parser.add_argument('-s', '--start-index', action='store', type=int, dest='start_index', default=0)
    parser.add_argument('-n', '--max-files', action='store', type=int, dest='max_files', help='maximum number of files to process')
    parser.add_argument('-d', '--disable-parallel', action='store_true', help='do not run tests in parallel')
    parser.add_argument('--diff', choices=['unified', 'html', 'opendiff'], default='opendiff',
                        help='how to display the diffs')
    parser.add_argument('--reportstats', action='store_true', help='print performance statistics')
    parser.add_argument('--profile', action='store_true', help='run everything through the profiler')

    args = parser.parse_args()

    num_actions = 0
    actions = 'find test'.split()
    for act in actions:
        if getattr(args, act):
            num_actions += 1

    if num_actions != 1:
        parser.error('Please specify exactly one of the options %s.' % ', '.join('--' + x for x in actions))

    if args.profile:
        print('Profiling...')
        utils.profile('call_command(args, parser)', locals(), globals())
    else:
        call_command(args, parser)
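
call_command itself is not part of this listing. A minimal, hypothetical sketch of such a dispatcher, with find_files and run_tests assumed purely for illustration:

def call_command(args, parser):
    # Hypothetical dispatcher; the project's real implementation is not shown.
    if args.find:
        find_files(args.find, start_index=args.start_index,
                   max_files=args.max_files)  # find_files: assumed helper
    elif args.test:
        run_tests(parallel=not args.disable_parallel, diff=args.diff,
                  report_stats=args.reportstats)  # run_tests: assumed helper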
Example #3
def main():
    parser = argparse.ArgumentParser(description='Show some histograms for a directory of Xcode project files.')
    parser.add_argument('-u', '--utcoffset', type=int, default=-8, metavar='UTCOFFSET', help='UTC time offset, e.g. "-8" for California')
    parser.add_argument('--startyear', type=int, default=2006)
    parser.add_argument('--endyear', type=int, default=2014)
    parser.add_argument('-n', '--max-files', action='store', type=int, default=None, help='maximum number of files to process')
    parser.add_argument('--max-firstnames', action='store', type=int, default=None, help='maximum number of first names to consider')
    parser.add_argument('--emoji', action='store_true', help='add emoji characters to userhashes')
    parser.add_argument('--emojitable', action='store_true', help='only print the emoji table')
    parser.add_argument('--profile', action='store_true', help='run everything through the profiler')
    parser.add_argument('directory', help='directory with Xcode project files')

    args = parser.parse_args()

    if args.profile:
        write('Profiling...')
        utils.profile('call_command(args, parser)', locals(), globals())
    else:
        call_command(args)
Example #4
def main():
    from utils import (init_torch_seeds, model_info, profile, profile_training)

    init_torch_seeds(seed=1234)

    # analyze backbone characteristics of different models
    model_builders = [
        models.resnet18,
        models.resnet50,
        models.vgg16,
        models.shufflenet_v2_x2_0,
        models.mobilenet_v2,
        Yolov5,
        ghostnet,
    ][-2:]  # only profile the last two entries (Yolov5, ghostnet)

    for model_builder in model_builders:
        print(f'{10*"-"} {model_builder.__name__} {10*"-"}')
        model = get_backbone(model_builder, pretrained=False)
        model_info(model, verbose=False, img_size=512)
        profile(model, verbose=True, amp=True)
        profile_training(model, amp=True)
Example #5
for Base in [nn.Conv2d, DeformConv, SpatiallyConv,
             DepthwiseConv, FlattenedConv, GroupedConv, ShuffledGroupedConv]:

    # change 'BASE' class for 'Conv' wrapper class
    convs.BASE = Base
    if 'group' in Base.__name__.lower():
        convs.GROUPS = 8
    else:
        convs.GROUPS = 1

    print(f'BASE: {convs.BASE.__name__}, GROUPS: {convs.GROUPS}')
    model = centernet(heads={'cpt_hm': 30, 'cpt_off': 2, 'wh': 2})
    model.info()  # summary

    try:
        profile(model)  # timing
        model.fuse()  # fuse and print summary again
        profile(model)  # fuse timing
        profile_training(model)  # forward + backward timing/memory
    except Exception as e:
        print(e)

 """
 PyTorch version 1.6.0
 CUDA version 10.2
 cuDNN version 7605
 cuDNN deterministic False
 cuDNN benchmark True
 BASE: Conv2d, GROUPS: 1
 Model Summary: 260 layers, 17.9M parameters, 17.9M gradients, 62.1 GFLOPs
 Input size: torch.Size([1, 3, 512, 512])
Example #6
def train(data_train, model, nsp_loss, mlm_loss, vocab_size, ctx, store):
    """Training function."""
    mlm_metric = nlp.metric.MaskedAccuracy()
    nsp_metric = nlp.metric.MaskedAccuracy()
    mlm_metric.reset()
    nsp_metric.reset()

    lr = args.lr
    optim_params = {'learning_rate': lr, 'epsilon': 1e-6, 'wd': 0.01}
    if args.dtype == 'float16':
        optim_params['multi_precision'] = True

    trainer = mx.gluon.Trainer(model.collect_params(),
                               'bertadam',
                               optim_params,
                               update_on_kvstore=False,
                               kvstore=store)
    dynamic_loss_scale = args.dtype == 'float16'
    fp16_trainer = FP16Trainer(trainer, dynamic_loss_scale=dynamic_loss_scale)

    if args.start_step:
        state_path = os.path.join(args.ckpt_dir,
                                  '%07d.states.%02d' % (args.start_step, 0))
        logging.info('Loading trainer state from %s', state_path)
        nlp.utils.load_states(trainer, state_path)

    accumulate = args.accumulate
    num_train_steps = args.num_steps
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    params = [
        p for p in model.collect_params().values() if p.grad_req != 'null'
    ]
    param_dict = model.collect_params()

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    if accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    train_begin_time = time.time()
    begin_time = time.time()
    running_mlm_loss = running_nsp_loss = running_num_tks = 0
    batch_num = 0
    step_num = args.start_step

    parallel_model = ParallelBERT(model,
                                  mlm_loss,
                                  nsp_loss,
                                  vocab_size,
                                  store.num_workers * accumulate,
                                  trainer=fp16_trainer)
    num_ctxes = len(ctx)
    parallel = nlp.utils.Parallel(num_ctxes if num_ctxes > 1 else 0,
                                  parallel_model)

    while step_num < num_train_steps:
        for _, dataloader in enumerate(data_train):
            if step_num >= num_train_steps:
                break

            # create dummy data loader if needed
            if args.dummy_data_len:
                target_shape = (args.batch_size * num_ctxes,
                                args.dummy_data_len)
                dataloader = get_dummy_dataloader(dataloader, target_shape)

            for _, data_batch in enumerate(dataloader):
                if step_num >= num_train_steps:
                    break
                if batch_num % accumulate == 0:
                    step_num += 1
                    # if accumulate > 1, grad_req is set to 'add', and zero_grad is required
                    if accumulate > 1:
                        param_dict.zero_grad()
                    # update learning rate: linear warmup, then linear decay to zero
                    if step_num <= num_warmup_steps:
                        new_lr = lr * step_num / num_warmup_steps
                    else:
                        offset = lr * step_num / num_train_steps
                        new_lr = lr - offset
                    trainer.set_learning_rate(new_lr)
                    if args.profile:
                        profile(step_num, 10, 12, profile_name=args.profile)
                if args.use_avg_len:
                    data_list = [[seq.as_in_context(context) for seq in shard]
                                 for context, shard in zip(ctx, data_batch)]
                else:
                    if data_batch[0].shape[0] < len(ctx):
                        continue
                    data_list = split_and_load(data_batch, ctx)

                ns_label_list, ns_pred_list = [], []
                mask_label_list, mask_pred_list, mask_weight_list = [], [], []

                # parallel forward / backward
                for data in data_list:
                    parallel.put(data)
                for _ in range(len(ctx)):
                    (_, next_sentence_label, classified, masked_id, decoded,
                     masked_weight, ls1, ls2, valid_length) = parallel.get()
                    ns_label_list.append(next_sentence_label)
                    ns_pred_list.append(classified)
                    mask_label_list.append(masked_id)
                    mask_pred_list.append(decoded)
                    mask_weight_list.append(masked_weight)
                    running_mlm_loss += ls1.as_in_context(mx.cpu()) / num_ctxes
                    running_nsp_loss += ls2.as_in_context(mx.cpu()) / num_ctxes
                    running_num_tks += valid_length.sum().as_in_context(
                        mx.cpu())

                # update
                if (batch_num + 1) % accumulate == 0:
                    fp16_trainer.step(1, max_norm=1)
                nsp_metric.update(ns_label_list, ns_pred_list)
                mlm_metric.update(mask_label_list, mask_pred_list,
                                  mask_weight_list)
                # logging
                if (step_num + 1) % (args.log_interval) == 0 and (
                        batch_num + 1) % accumulate == 0:
                    log(begin_time, running_num_tks,
                        running_mlm_loss / accumulate,
                        running_nsp_loss / accumulate, step_num, mlm_metric,
                        nsp_metric, trainer, args.log_interval)
                    begin_time = time.time()
                    running_mlm_loss = running_nsp_loss = running_num_tks = 0
                    mlm_metric.reset_local()
                    nsp_metric.reset_local()

                # saving checkpoints
                if (step_num + 1) % args.ckpt_interval == 0 \
                   and (batch_num + 1) % accumulate == 0 and store.rank == 0:
                    save_states(step_num, trainer, args.ckpt_dir)
                    save_parameters(step_num, model, args.ckpt_dir)
                batch_num += 1
    if store.rank == 0:
        save_states(step_num, trainer, args.ckpt_dir)
        save_parameters(step_num, model, args.ckpt_dir)
    mx.nd.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f}s'.format(train_end_time -
                                             train_begin_time))
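
The learning-rate logic above is a linear warmup followed by linear decay: with base rate \eta_0 = lr, warmup steps W = num_warmup_steps, and total steps T = num_train_steps,

    \eta(s) = \begin{cases} \eta_0 \, s / W, & s \le W \\ \eta_0 \, (1 - s / T), & s > W \end{cases}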
Example #7
def profile_internal(e, o):
    out, result = profile(app)(e, o)
    return list(out) + ["<pre>" + net.websafe(result) + "</pre>"]
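
This is web.py-style profiling middleware: profile(app) runs the WSGI app under a profiler and returns the app's output together with a human-readable stats string, which is then appended to the response as an HTML <pre> block. A minimal sketch of such a helper using cProfile/pstats (the real web.py implementation differs in detail):

import cProfile
import io
import pstats

def profile(func):
    """Run func under cProfile; return (func's result, stats string)."""
    def wrapped(*args, **kwargs):
        prof = cProfile.Profile()
        result = prof.runcall(func, *args, **kwargs)
        buf = io.StringIO()
        pstats.Stats(prof, stream=buf).sort_stats('cumulative').print_stats(25)
        return result, buf.getvalue()
    return wrapped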
Example #8
    def compute_solution(self, state, prev_move=None, callback=None):
        if state == RubiksCube.SOLVED_STR:
            solution = []
        elif prev_move is not None and len(
                self._solution) > 1 and self._solution[0] == prev_move:
            solution = self._solution[1:]
        else:
            solution = solve(state)
        self._solution = solution
        if callback is not None:
            callback(solution)


if __name__ == '__main__':
    c = RubiksCube()
    with profile(True):
        for i in range(10):
            c.shuffle()
            # c.pprint()
            try:
                moves = solve(c.state_string)
                # for m in moves.split(" "):
                #     c.move(m)
                # c.pprint()
                # print("Number of moves", len(moves))
            except (Exception, KeyboardInterrupt) as e:
                print(e, f"Cube state: {c.state_string}")
                c.pprint()
                raise e
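
Here profile is used as a context manager taking an on/off flag (see also `with profile(on=False)` in Example #16). A minimal compatible sketch, assuming it simply wraps cProfile and prints stats on exit (hypothetical, not this repository's actual helper):

import cProfile
import pstats
from contextlib import contextmanager

@contextmanager
def profile(on=True):
    """Profile the enclosed block with cProfile when `on` is truthy."""
    if not on:
        yield
        return
    prof = cProfile.Profile()
    prof.enable()
    try:
        yield
    finally:
        prof.disable()
        pstats.Stats(prof).sort_stats('cumulative').print_stats(20)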
Example #9
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu 
    #if args.gpu is not None:
    #    print("Use GPU: {} for training".format(args.gpu))
    
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
        
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
       
    if args.debug:
        for k, _ in model.named_modules():
            print(k)
        return
    if args.show:
        input_data = torch.randn([1, 3, 224, 224])
        summary(model.cuda(), (3, 224, 224))
        model = model.cpu()
        with SummaryWriter(log_dir='./log', comment='resnet18') as w:
            w.add_graph(model, input_data)
        return
    if args.flops:
        input_data = torch.randn([1, 3, 224, 224])
        flops, params = profile(model, inputs=(input_data,))
        print(flops)
        # thop's clever_format takes a list of values plus a format string
        flops, params = clever_format([flops, params], '%.3f')
        print("flops: {}, params: {}".format(flops, params))
        return
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume_normal:
        if os.path.isfile(args.resume_normal):
            print("=> loading checkpoint '{}'".format(args.resume_normal))
            checkpoint = torch.load(args.resume_normal)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']  # key matches what save_checkpoint stores below
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume_normal, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume_normal))
    elif args.resume_from:  # increase channel_removed_ratio as FBS
        if os.path.isfile(args.resume_from):
            if not args.lasso:
                print("=> loading pretrained model '{}'".format(args.resume_from))
                print("=> increase channel removed ratio to '{}'".format(args.channel_removed_ratio))
                checkpoint = torch.load(args.resume_from)
                args.start_epoch = 0
                model.load_state_dict(checkpoint['state_dict'])
                print("=> loaded pretrained model '{}' (epoch {})".format(args.resume_from, args.start_epoch))
            elif args.lasso:
                print("=> loading pretrained model '{}'".format(args.resume_from))
                print("=> increase channel removed ratio to '{}'".format(args.channel_removed_ratio))
                checkpoint = torch.load(args.resume_from)
                args.start_epoch = 0
                oldmodel = checkpoint['state_dict']
                #for k,v in oldmodel.items():
                #    print(k)
                for key,value in model.state_dict().items():
                    if "channel_l1" in key:
                        continue
                    if "spatial_l1" in key:
                        continue
                    value.copy_(oldmodel[key])
                print("=> loaded pretrained model '{}' (epoch {})".format(args.resume_from, args.start_epoch))
                #return

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
    log.close()
Example #10
def train(data_train, data_eval, model, nsp_loss, mlm_loss, vocab_size, ctx):
    """Training function."""
    hvd.broadcast_parameters(model.collect_params(), root_rank=0)

    mlm_metric = nlp.metric.MaskedAccuracy()
    nsp_metric = nlp.metric.MaskedAccuracy()
    mlm_metric.reset()
    nsp_metric.reset()

    logging.debug('Creating distributed trainer...')
    lr = args.lr
    optim_params = {'learning_rate': lr, 'epsilon': 1e-6, 'wd': 0.01}
    if args.dtype == 'float16':
        optim_params['multi_precision'] = True

    dynamic_loss_scale = args.dtype == 'float16'
    if dynamic_loss_scale:
        loss_scale_param = {'scale_window': 2000 / num_workers}
    else:
        loss_scale_param = None
    trainer = hvd.DistributedTrainer(model.collect_params(), 'bertadam', optim_params)
    fp16_trainer = FP16Trainer(trainer, dynamic_loss_scale=dynamic_loss_scale,
                               loss_scaler_params=loss_scale_param)

    if args.start_step:
        state_path = os.path.join(args.ckpt_dir, '%07d.states.%02d'%(args.start_step, local_rank))
        logging.info('Loading trainer state from %s', state_path)
        nlp.utils.load_states(trainer, state_path)

    accumulate = args.accumulate
    num_train_steps = args.num_steps
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    params = [p for p in model.collect_params().values() if p.grad_req != 'null']
    param_dict = model.collect_params()

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    if accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    train_begin_time = time.time()
    begin_time = time.time()
    running_mlm_loss, running_nsp_loss = 0, 0
    running_num_tks = 0
    batch_num = 0
    step_num = args.start_step

    logging.debug('Training started')
    while step_num < num_train_steps:
        for _, dataloader in enumerate(data_train):
            if step_num >= num_train_steps:
                break

            # create dummy data loader if needed
            if args.dummy_data_len:
                target_shape = (args.batch_size, args.dummy_data_len)
                dataloader = get_dummy_dataloader(dataloader, target_shape)

            for _, data_batch in enumerate(dataloader):
                if step_num >= num_train_steps:
                    break
                if batch_num % accumulate == 0:
                    step_num += 1
                    # if accumulate > 1, grad_req is set to 'add', and zero_grad is required
                    if accumulate > 1:
                        param_dict.zero_grad()
                    # update learning rate
                    if step_num <= num_warmup_steps:
                        new_lr = lr * step_num / num_warmup_steps
                    else:
                        offset = lr * step_num / num_train_steps
                        new_lr = lr - offset
                    trainer.set_learning_rate(new_lr)
                    if args.profile:
                        profile(step_num, 10, 14, profile_name=args.profile + str(rank))

                # load data
                if args.use_avg_len:
                    data_list = [[seq.as_in_context(context) for seq in shard]
                                 for context, shard in zip([ctx], data_batch)]
                else:
                    data_list = list(split_and_load(data_batch, [ctx]))
                data = data_list[0]

                # forward
                with mx.autograd.record():
                    (ls, ns_label, classified, masked_id, decoded, \
                     masked_weight, ls1, ls2, valid_len) = forward(data, model, mlm_loss,
                                                                   nsp_loss, vocab_size, args.dtype)
                    ls = ls / accumulate
                    # backward
                    if args.dtype == 'float16':
                        fp16_trainer.backward(ls)
                    else:
                        ls.backward()

                running_mlm_loss += ls1.as_in_context(mx.cpu())
                running_nsp_loss += ls2.as_in_context(mx.cpu())
                running_num_tks += valid_len.sum().as_in_context(mx.cpu())

                # update
                if (batch_num + 1) % accumulate == 0:
                    # step() performs 3 things:
                    # 1. allreduce gradients from all workers
                    # 2. checking the global_norm of gradients and clip them if necessary
                    # 3. averaging the gradients and apply updates
                    fp16_trainer.step(1, max_norm=1*num_workers)

                nsp_metric.update([ns_label], [classified])
                mlm_metric.update([masked_id], [decoded], [masked_weight])

                # logging
                if (step_num + 1) % (args.log_interval) == 0 and (batch_num + 1) % accumulate == 0:
                    log(begin_time, running_num_tks, running_mlm_loss / accumulate,
                        running_nsp_loss / accumulate, step_num, mlm_metric, nsp_metric,
                        trainer, args.log_interval)
                    begin_time = time.time()
                    running_mlm_loss = running_nsp_loss = running_num_tks = 0
                    mlm_metric.reset_local()
                    nsp_metric.reset_local()

                # saving checkpoints
                if (step_num + 1) % args.ckpt_interval == 0 and (batch_num + 1) % accumulate == 0:
                    if is_master_node:
                        save_states(step_num, trainer, args.ckpt_dir, local_rank)
                        if local_rank == 0:
                            save_parameters(step_num, model, args.ckpt_dir)
                    if data_eval:
                        # eval data is always based on a fixed npz file.
                        dataset_eval = get_pretrain_data_npz(data_eval, args.batch_size_eval, 1,
                                                             False, False, 1)
                        evaluate(dataset_eval, model, nsp_loss, mlm_loss, len(vocab), [ctx],
                                 args.log_interval, args.dtype)

                batch_num += 1

    if is_master_node:
        save_states(step_num, trainer, args.ckpt_dir, local_rank)
        if local_rank == 0:
            save_parameters(step_num, model, args.ckpt_dir)
    mx.nd.waitall()
    train_end_time = time.time()
    logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))
Example #11
    def issue_queries(self, query_samples):
        def run_one_batch(cur_batch_size=1, base_index=0):
            inputs_list = []
            token_types_list = []
            valid_length_list = []
            for i in range(cur_batch_size):
                idx = base_index + i
                eval_features = self.qsl.get_features(query_samples[idx].index)
                example_ids, inputs, token_types, valid_length, _, _ = eval_features
                inputs_list.append(inputs)
                token_types_list.append(token_types)
                valid_length_list.append(valid_length)

            max_len = max([len(inp) for inp in inputs_list])
            for i in range(len(inputs_list)):
                inputs_list[i] += [0] * (max_len - len(inputs_list[i]))
                token_types_list[i] += [0] * (max_len -
                                              len(token_types_list[i]))

            inputs = mx.nd.array(inputs_list).as_in_context(self.ctx)
            token_types = mx.nd.array(token_types_list).as_in_context(self.ctx)
            valid_length = mx.nd.array(valid_length_list).as_in_context(
                self.ctx).astype('float32')

            ## run with a batch
            out = self.net(inputs, token_types, valid_length)
            out_np = out.asnumpy()

            out_list = np.split(out_np, cur_batch_size, axis=0)
            for i, o in enumerate(out_list):
                idx = base_index + i
                response_array = array.array(
                    "B",
                    np.array(o).astype(np.float32).tobytes())
                bi = response_array.buffer_info()
                response = lg.QuerySampleResponse(query_samples[idx].id, bi[0],
                                                  bi[1])
                lg.QuerySamplesComplete([response])

        num_samples = len(query_samples)
        if num_samples == 1:
            eval_features = self.qsl.get_features(query_samples[0].index)
            example_ids, inputs, token_types, valid_length, _, _ = eval_features
            inputs = mx.nd.array(inputs).reshape(1, -1)
            token_types = mx.nd.array(token_types).reshape(1, -1)
            valid_length = mx.nd.array(valid_length).reshape(-1, )

            out = self.net(
                inputs.as_in_context(self.ctx),
                token_types.as_in_context(self.ctx),
                valid_length.as_in_context(self.ctx).astype('float32'))
            out = out.asnumpy()

            response_array = array.array(
                "B",
                np.array(out).astype(np.float32).tobytes())
            bi = response_array.buffer_info()
            response = lg.QuerySampleResponse(query_samples[0].id, bi[0],
                                              bi[1])
            lg.QuerySamplesComplete([response])
        else:
            ## TODO, used in batch_size tuning
            if num_samples < self.batch_size:
                if self.logger:
                    self.logger.error(
                        'batch_size {0} is larger than the number of provided samples {1};'
                        ' consider decreasing batch_size.'.format(
                            self.batch_size, num_samples))
                sys.exit(-1)

            num_batch = num_samples // self.batch_size
            remaining_batch = num_samples % self.batch_size
            if self.logger:
                self.logger.info(
                    'split the datasets into {0} batches with bs={1} and remaining {2}...'
                    .format(num_batch, self.batch_size, remaining_batch))

            start_step = 10
            end_step = 30 if num_batch > 30 else num_batch
            for b in range(num_batch):
                base_index = b * self.batch_size
                profile(b,
                        start_step,
                        end_step,
                        profile_name='profile.json',
                        early_exit=False)
                run_one_batch(self.batch_size, base_index)

            if remaining_batch > 0:
                base_index = num_batch * self.batch_size
                run_one_batch(remaining_batch, base_index)
Example #12
                                       transform=transform_test)
testloader = torch.utils.data.DataLoader(testset,
                                         batch_size=100,
                                         shuffle=False,
                                         num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
           'ship', 'truck')

# Model
print('==> Building model..')

get_model = model_dict[args.net]
net = get_model()
print('==> Model:', args.net)
flops, params = profile(net, inputs=(torch.randn(1, 3, 32, 32), ))
print('* MACs: {:,.2f}'.format(flops).replace('.00', ''))
print('* Params: {:,.2f}'.format(params).replace('.00', ''))
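
# Note: the profile call above matches thop's signature (it returns MAC and
# parameter counts). Assuming thop is the profiler in use here, its
# clever_format helper is a common alternative for humanizing these numbers:
#     from thop import clever_format
#     macs, params = clever_format([flops, params], '%.2f')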

if torch.cuda.is_available():
    device = 'cuda'
    print('==> cuda is available (gpu)')
else:
    device = 'cpu'
    print('==> No cuda, running on cpu')
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if args.resume:
Example #13
def deriv(z, t):  # signature assumed from the odeint(deriv, init, time) call below
    # state z = [theta, theta_dot, x, x_dot]; l, m, k, mass and spring come
    # from the (truncated) setup above
    M = mass
    g = 9.8

    return jet.array([
        z[1],
        -1.0/(l+z[2])*(2*z[1]*z[3]+g*(m/2+M)/(m/3+M)*jet.sin(z[0])),
        z[3],
        (l+z[2])*z[1]**2+(m/2+M)/(m/3+M)*g*jet.cos(z[0])-1.0/(m/3+M)*k*z[2]
        ])

# Create time steps (numpy.linspace requires an integer sample count)
time = numpy.linspace(0.0, 10.0, int(1e7))

# Specify initial conditions: [theta, theta_dot, x, x_dot]
init = numpy.array([jet.pi / 2, 0, mass * 9.8 / spring['k'], 0])

profile_derive = lambda: profile(lambda: deriv(init, time[0]))
profile_odeint = lambda: profile(lambda: odeint(deriv, init, time))

print('jet_mode = True')
print('derivatives: %f, %f' % (profile_derive(), profile_derive()))
print('integration: %f' % profile_odeint())

jet.set_options(jet_mode=False)

print('---')
print('jet_mode = False')
print('derivatives: %f' % profile_derive())
print('integration: %f' % profile_odeint())
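
For reference, deriv above encodes an elastic (spring) pendulum with state z = (\theta, \dot{\theta}, x, \dot{x}); transcribed directly from the code (M is the attached mass; m, l and k come from the truncated setup):

    \ddot{\theta} = -\frac{1}{l + x}\left(2\,\dot{\theta}\,\dot{x} + g\,\frac{m/2 + M}{m/3 + M}\,\sin\theta\right)
    \ddot{x} = (l + x)\,\dot{\theta}^{2} + \frac{m/2 + M}{m/3 + M}\,g\,\cos\theta - \frac{k\,x}{m/3 + M}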
Example #16
def run_cube_sim():  # def line reconstructed; referenced by Thread(target=run_cube_sim) below
    # (truncated above: `clock` is created earlier in the original file)
    while not app.closed:
        app.draw_frame(cube)

        dt = clock.tick(30)
        app.event_hub.raise_event(
            Event(origin=Event.APPLICATION, type=Event.NEWFRAME, dt=dt))
        event_hub.handle_events()


def run_controls_ui():
    dash.mainloop()


if __name__ == '__main__':
    with profile(on=False):
        event_hub = EventsHub()
        cube = RubiksCubeDrawer(event_hub)
        cube.state.pprint()
        # cube.load_state(RubiksCube.SOLVED_STR)

        camera = Camera(event_hub)

        app = OpenGLApp(event_hub)
        dash = Dashboard(event_hub)

        tCube = Thread(target=run_cube_sim)
        tCube.start()

        run_controls_ui()  # Tkinter needs to be called from main thread
        tCube.join()
Example #17
    def run(self):
        global batching
        #os.sched_setaffinity(self.pid, self.affinity)
        cmd = "taskset -p -c %d-%d %d" % (self.start_core_idx,
                                          self.end_core_idx, self.pid)
        print(cmd)
        os.system(cmd)
        import mxnet as mx
        ctx = mx.cpu()
        #from numexpr.utils import set_num_threads
        #set_num_threads(28)
        os.environ['OMP_NUM_THREADS'] = '{}'.format(self.end_core_idx -
                                                    self.start_core_idx + 1)

        model = BERTModel(mx.cpu(), self.args.vocab, self.args.params,
                          self.args.quantized,
                          self.args.quantized_model_prefix)
        data_set = BERTDataSet(self.args.vocab, self.args.perf_count)

        self.lock.acquire()
        self.calibrate_counter.value += 1
        self.lock.release()
        block_until(self.calibrate_counter, self.world_size)
        if self.args.perf_calibrate:
            self.calibrate(model, data_set, ctx)
            return

        self.lock.acquire()
        self.calibrate_counter.value += 1
        self.lock.release()
        if self.args.warmup:
            self.warmup(model, data_set, ctx, self.args.scenario)

        self.lock.acquire()
        self.init_counter.value += 1
        self.lock.release()

        #affinity = os.sched_getaffinity(self.pid)
        #print('Process', self.pid, 'affinity proc list:', affinity)
        cur_step = 0
        start_step = 384
        end_step = -1
        from utils import profile

        while True:
            next_task = self.task_queue.get()  #(self.proc_idx)
            if next_task is None:
                # None means shutdown
                log.info('Exiting {}-pid:{}, cur_step={}'.format(
                    self.name, self.pid, cur_step))
                self.task_queue.task_done()
                if self.args.profile and self.proc_idx == 0:
                    if end_step == -1:
                        end_step = cur_step
                    profile(cur_step,
                            start_step,
                            end_step,
                            profile_name='profile_{}.json'.format(self.pid),
                            early_exit=False)
                break

            query_id_list = next_task.query_id_list
            sample_index_list = next_task.sample_index_list
            batch_size = len(sample_index_list)
            #print ('pid-{}, query_id_list: {}, sample_index_list: {}'.format(self.pid, query_id_list, sample_index_list))
            inputs_list = []
            token_types_list = []
            valid_length_list = []
            for sample_index in sample_index_list:
                eval_feature = data_set.eval_features[sample_index]
                _, inputs, token_types, valid_length, _, _ = eval_feature
                inputs_list.append(inputs)
                token_types_list.append(token_types)
                valid_length_list.append(valid_length)

            if len(inputs_list) > 1:
                max_len = max([len(inp) for inp in inputs_list])
                new_max_len, bs, best_throughput = get_best_bs(max_len)
                if bs == len(inputs_list):
                    max_len = new_max_len
                #for i in range(len(inputs_list)):
                #    inputs_list[i] += [0] * (max_len - len(inputs_list[i]))
                #    token_types_list[i] += [0] * (max_len - len(token_types_list[i]))
            else:
                max_len = self.max_pad_len  #len(inputs_list[0]) #self.max_pad_len #len(inputs_list)

            for i in range(len(inputs_list)):
                inputs_list[i] += [0] * (max_len - len(inputs_list[i]))
                token_types_list[i] += [0] * (max_len -
                                              len(token_types_list[i]))

            inputs = mx.nd.array(inputs_list).as_in_context(ctx)
            token_types = mx.nd.array(token_types_list).as_in_context(ctx)
            valid_length = mx.nd.array(valid_length_list).as_in_context(
                ctx).astype('float32')

            if self.args.profile and self.proc_idx == 0:
                profile(cur_step,
                        start_step,
                        end_step,
                        profile_name='profile_{}.json'.format(self.pid),
                        early_exit=False)
                cur_step += 1
            #t0 = time.time()
            out = model.net(inputs, token_types, valid_length)
            out_np = out.asnumpy()
            #t1 = time.time()
            #if self.proc_idx == 0:
            #    cur_throughput = len(inputs_list)/(t1-t0)
            #    if best_throughput != 0:
            #        throughput_diff = (cur_throughput - best_throughput) / best_throughput
            #        print ('inference seq len = {} BS = {} throughput = {:.5f} ({:.3f}%)'.format(max_len, len(inputs_list), cur_throughput, throughput_diff*100))
            #    else:
            #        print ('inference seq len = {} BS = {} throughput = {:.5f})'.format(max_len, len(inputs_list), cur_throughput))
            result = Output(query_id_list, out_np)
            self.result_queue.put(result)
            #print('consumer-{}: output.shape={}, query_id={}'.format(self.pid, out_np.shape, query_id_list[0]))
            self.task_queue.task_done()
Example #19
import FaultyMemory as FyM
import torch
from utils import profile
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")

resnet18 = models.resnet18(pretrained=True).to(device)
dummy_tensor = torch.randn([32, 3, 32, 32]).to(device)
representation = FyM.SlowFixedPointRepresentation()


def inference_parameters():
    handler = FyM.Handler(resnet18)
    handler.add_net_parameters(representation)
    handler(dummy_tensor)


_ = profile(inference_parameters, __file__, device)