コード例 #1
0
ファイル: micro_bench.py プロジェクト: mila-iqia/training
def run_benchmarking(net,
                     batch_size,
                     iterations,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    """Time forward/backward steps of ``net`` and dump the results as JSON.

    Runs two warmup steps followed by ``iterations`` timed steps on one
    random input batch, then writes ``process_report_<rank>.json`` and, on
    rank 0 only, ``overall_report.json``.

    NOTE(review): the output directory comes from the free name ``tmp``, which
    is not defined in this function -- presumably a module-level path; confirm.

    Args:
        net: Model name handed to ``get_network``. ``"inception_v3"`` gets
            299x299 inputs, every other name gets 224x224 inputs.
        batch_size: Number of images in the synthetic batch.
        iterations: Number of timed forward/backward steps.
        run_fp16: When true, the network goes through ``network_to_half``,
            the input is cast with ``.half()`` and the optimizer is built
            from ``get_param_copy(network)``.
        dataparallel: Wrap the network in ``torch.nn.DataParallel``.
        distributed_dataparallel: Call ``rendezvous`` and wrap the network in
            ``DistributedDataParallel``. Ignored when ``dataparallel`` is set.
        device_ids: Optional list of CUDA device indices. The first entry
            becomes the current device; the list is forwarded to the wrapper.
        distributed_parameters: Optional mapping forwarded to ``rendezvous``.
            ``rank`` (default -1) and ``world_size`` (default 1) are read
            from it for the reports; ``None`` is treated as an empty mapping.
    """
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=device_ids)

    if dataparallel or distributed_dataparallel:
        # Without an explicit list the wrappers use every visible device.
        if device_ids is not None:
            num_devices = len(device_ids)
        else:
            num_devices = torch.cuda.device_count()
    else:
        num_devices = 1

    image_side = 299 if net == "inception_v3" else 224
    inp = torch.randn(batch_size, 3, image_side, image_side, device="cuda")
    if run_fp16:
        inp = inp.half()

    # The upper bound of randint is exclusive, so every target is class 0.
    target = torch.randint(0, 1, size=(batch_size, ), device='cuda')

    if run_fp16:
        param_copy = get_param_copy(network)
    else:
        param_copy = network.parameters()

    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for _ in range(iterations):
        forwardbackward(inp, optimizer, network, target)

    # Wait for all queued GPU work before reading the clock.
    torch.cuda.synchronize()

    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    # distributed_parameters defaults to None; calling .get() on it used to
    # raise AttributeError here, after the benchmark had already finished.
    report_parameters = (distributed_parameters
                         if distributed_parameters is not None else {})
    rank = report_parameters.get('rank', -1)
    world_size = report_parameters.get('world_size', 1)

    process_report = {
        'model': net,
        'rank': rank,
        'num_device': num_devices,
        'batch_size': batch_size,
        'batch_time': time_per_batch,
        'speed': batch_size / time_per_batch
    }

    with open(f'{tmp}/process_report_{rank}.json', 'w') as report:
        json.dump(process_report, report)

    # Only rank 0 writes the aggregate, which scales this process's numbers
    # by world_size rather than collecting measurements from other ranks.
    if rank == 0:
        overall_report = {
            'world_size': world_size,
            'batch_size': batch_size * world_size,
            'batch_time': time_per_batch,
            'speed': batch_size * world_size / time_per_batch
        }
        with open(f'{tmp}/overall_report.json', 'w') as report:
            json.dump(overall_report, report)
コード例 #2
0
def run_benchmarking(local_rank,
                     ngpus,
                     net,
                     batch_size,
                     iterations,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    """Time forward/backward steps of ``net`` and print a summary.

    Runs two warmup steps followed by ``iterations`` timed steps on one
    random input batch and prints per-process throughput. In distributed
    mode it also prints an overall figure scaled by ``world_size``.

    Args:
        local_rank: Index of this process among the local processes; used to
            pick this process's device and added to the base rank.
        ngpus: Number of local GPUs. Must equal ``len(device_ids)`` when
            ``device_ids`` is given. In distributed mode ``batch_size`` is
            divided by it.
        net: Model name handed to ``get_network``. ``"inception_v3"`` gets
            299x299 inputs, every other name gets 224x224 inputs.
        batch_size: Number of images in the synthetic batch (before the
            per-process split in distributed mode).
        iterations: Number of timed forward/backward steps.
        run_fp16: When true, the network goes through ``network_to_half``,
            the input is cast with ``.half()`` and the optimizer is built
            from ``get_param_copy(network)``.
        dataparallel: Wrap the network in ``torch.nn.DataParallel``.
        distributed_dataparallel: Call ``rendezvous`` and wrap the network in
            ``DistributedDataParallel``. Ignored when ``dataparallel`` is set.
        device_ids: Optional list of CUDA device indices.
        distributed_parameters: Mapping with ``rank`` and ``world_size``;
            required in distributed mode. Its ``rank`` entry is incremented
            in place by ``local_rank``.
    """
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    elif distributed_dataparallel and not dataparallel:
        # DistributedDataParallel below is given device_ids=[local_rank], so
        # the current device has to match it. Selecting cuda:0 on every rank
        # left the "cuda" tensors of ranks > 0 on the wrong device.
        # NOTE(review): assumes get_network places the model on the current
        # device, as the input tensor below does -- confirm.
        torch.cuda.set_device("cuda:%d" % local_rank)
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print ("INFO: Running dataparallel on devices: {}".format(str(devices_to_run_on)))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif distributed_dataparallel:
        # The incoming rank is the base rank of this node; offset it by the
        # local process index before joining the process group.
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)]
        print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on)))
        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
        # Each process handles an equal share of the requested batch.
        batch_size = batch_size // ngpus

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    # NOTE(review): targets are 0..batch_size-1, so batch_size presumably must
    # not exceed the model's number of output classes -- confirm.
    target = torch.arange(batch_size, device="cuda")
    if run_fp16:
        param_copy = get_param_copy(network)
    else:
        param_copy = network.parameters()
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print ("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print ("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
        if i % 10 == 0:
            print (time.asctime( time.localtime(time.time())) + " INFO: iteration " + str(i) + " completed.")
    # Wait for all queued GPU work before reading the clock.
    torch.cuda.synchronize()

    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print ("OK: finished running benchmark..")
    print ("--------------------SUMMARY--------------------------")
    print ("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------")
        print ("Num devices: 1")
    else:
        print ("Num devices: {}".format(ngpus))
    print ("Mini batch size [img] : {}".format(batch_size))
    print ("Time per mini-batch : {}".format(time_per_batch))
    print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch))
    if distributed_dataparallel:
        # The overall numbers scale this process's measurement by world_size;
        # nothing is gathered from the other ranks.
        print ("")
        print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print ("Num devices: {}".format(world_size))
        print ("Mini batch size [img] : {}".format(batch_size*world_size))
        print ("Time per mini-batch : {}".format(time_per_batch))
        print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch))
コード例 #3
0
def run_benchmarking(net,
                     batch_size,
                     iterations,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    """Time forward/backward steps of ``net`` and print a summary.

    Prints the parameter count, runs two warmup steps followed by
    ``iterations`` timed steps on one random input batch, then prints
    per-process throughput. In distributed mode an overall figure scaled by
    ``world_size`` is printed as well.

    Args:
        net: Model name handed to ``get_network``. ``"inception_v3"`` gets
            299x299 inputs, every other name gets 224x224 inputs.
        batch_size: Number of images in the synthetic batch.
        iterations: Number of timed forward/backward steps.
        run_fp16: When true, the network goes through ``network_to_half``,
            the input is cast with ``.half()`` and the optimizer is built
            from ``get_param_copy(network)``.
        dataparallel: Wrap the network in ``torch.nn.DataParallel``.
        distributed_dataparallel: Call ``rendezvous`` and wrap the network in
            ``DistributedDataParallel``. Ignored when ``dataparallel`` is set.
        device_ids: Optional list of CUDA device indices. The first entry
            becomes the current device; the list is forwarded to the wrapper.
        distributed_parameters: Mapping forwarded to ``rendezvous``; its
            ``rank`` and ``world_size`` entries are read in distributed mode.
    """
    # Make the first requested device current, or device 0 when none is given.
    if not device_ids:
        torch.cuda.set_device("cuda:0")
    else:
        torch.cuda.set_device("cuda:%d" % device_ids[0])

    network = get_network(net)
    print('Total parameters:', count_parameters(network))

    if run_fp16:
        network = network_to_half(network)

    # Optional multi-device wrapper; DataParallel wins if both flags are set.
    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=device_ids)

    # Device count used in the summary below.
    if not (dataparallel or distributed_dataparallel):
        num_devices = 1
    elif device_ids is None:
        num_devices = torch.cuda.device_count()
    else:
        num_devices = len(device_ids)

    # Synthetic input batch and targets 0..batch_size-1.
    image_side = 299 if net == "inception_v3" else 224
    inp = torch.randn(batch_size, 3, image_side, image_side, device="cuda")
    if run_fp16:
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")

    trainable = get_param_copy(network) if run_fp16 else network.parameters()
    optimizer = torch.optim.SGD(trainable, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    for _ in range(2):
        forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    started = time.time()
    for _ in range(iterations):
        forwardbackward(inp, optimizer, network, target)
    # Wait for all queued GPU work before reading the clock.
    torch.cuda.synchronize()

    finished = time.time()
    time_per_batch = (finished - started) / iterations

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        rank_text = str(distributed_parameters['rank'])
        print("--------This process: rank " + rank_text + "--------")
    print("Num devices: {}".format(num_devices))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))

    if not distributed_dataparallel:
        return

    # Overall figures scale this process's measurement by world_size.
    print("")
    print(
        "--------Overall (all ranks) (assuming same num/type devices for each rank)--------"
    )
    world_size = distributed_parameters['world_size']
    print("Num devices: {}".format(num_devices * world_size))
    print("Mini batch size [img] : {}".format(batch_size * world_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size * world_size /
                                             time_per_batch))
def run_benchmarking(local_rank,
                     ngpus,
                     net,
                     batch_size,
                     iterations,
                     prof_step,
                     amp_opt_level,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    """Time forward/backward steps of ``net`` and print a summary.

    Runs two warmup steps followed by ``iterations`` timed steps on one
    random input batch and prints per-process throughput together with the
    precision mode. In distributed mode it also prints an overall figure
    scaled by ``world_size``.

    Args:
        local_rank: Index of this process among the local processes; used to
            pick this process's device and added to the base rank.
        ngpus: Number of local GPUs. Must equal ``len(device_ids)`` when
            ``device_ids`` is given. In distributed mode ``batch_size`` is
            divided by it.
        net: Model name handed to ``get_network``; must be a key of
            ``models`` or ``segmentation_models``. ``"inception_v3"`` gets
            299x299 inputs, every other name gets 224x224 inputs.
        batch_size: Number of images in the synthetic batch (before the
            per-process split in distributed mode).
        iterations: Number of timed forward/backward steps.
        prof_step: Iteration index at which the step index is passed to
            ``forwardbackward`` as an extra argument (presumably to trigger
            profiling of that step -- confirm against ``forwardbackward``).
        amp_opt_level: Falsy to skip ``apex.amp``; otherwise an integer N
            used as ``opt_level="O<N>"``.
        run_fp16: When true, the network goes through ``network_to_half``,
            the input is cast with ``.half()`` and the optimizer is built
            from ``get_param_copy(network)``.
        dataparallel: Wrap the network in ``torch.nn.DataParallel``.
        distributed_dataparallel: Call ``rendezvous`` and wrap the network in
            ``DistributedDataParallel``. Ignored when ``dataparallel`` is set.
        device_ids: Optional list of CUDA device indices.
        distributed_parameters: Mapping with ``rank`` and ``world_size``;
            required in distributed mode. Its ``rank`` entry is incremented
            in place by ``local_rank``.

    Raises:
        ValueError: If ``net`` is in neither ``models`` nor
            ``segmentation_models``, so no target tensor can be built.
    """
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    elif distributed_dataparallel and not dataparallel:
        # DistributedDataParallel below is given device_ids=[local_rank], so
        # the current device has to match it. Selecting cuda:0 on every rank
        # left the "cuda" tensors of ranks > 0 on the wrong device.
        # NOTE(review): assumes get_network places the model on the current
        # device, as the input tensor below does -- confirm.
        torch.cuda.set_device("cuda:%d" % local_rank)
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if "shufflenet" == net:
        # The original applied weight_init to an undefined name `model`;
        # the freshly built network is what has to be initialised.
        network.apply(weight_init)

    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print("INFO: Running dataparallel on devices: {}".format(
            str(devices_to_run_on)))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif distributed_dataparallel:
        # The incoming rank is the base rank of this node; offset it by the
        # local process index before joining the process group.
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [
            (device_ids[local_rank] if device_ids else local_rank)
        ]
        print("INFO: Rank {} running distributed_dataparallel on devices: {}".
              format(distributed_parameters['rank'], str(devices_to_run_on)))
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=devices_to_run_on)
        # Each process handles an equal share of the requested batch.
        batch_size = batch_size // ngpus

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    if net in models:
        # number of classes is 1000 for imagenet
        target = torch.randint(0, 1000, (batch_size, ), device="cuda")
    elif net in segmentation_models:
        # number of classes is 21 for segmentation
        target = torch.randint(0, 21, (batch_size, ), device="cuda")
    else:
        # Previously `target` stayed unbound and the first warmup step died
        # with an UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            "no target definition for network {!r}".format(net))
    if run_fp16:
        param_copy = get_param_copy(network)
    else:
        param_copy = network.parameters()
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    if amp_opt_level:
        network, optimizer = apex.amp.initialize(network,
                                                 optimizer,
                                                 opt_level="O%d" %
                                                 amp_opt_level)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target, amp_opt_level)
    forwardbackward(inp, optimizer, network, target, amp_opt_level)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        if i == prof_step:
            forwardbackward(inp, optimizer, network, target, amp_opt_level, i)
        else:
            forwardbackward(inp, optimizer, network, target, amp_opt_level)
    # Wait for all queued GPU work before reading the clock.
    torch.cuda.synchronize()

    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    # Human-readable precision label; run_fp16 takes precedence over AMP.
    amp_descriptions = {
        1: 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.',
        2: 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.',
        3: 'AMP-O3: Pure FP16 training.',
        4: 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.',
        5: 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.',
    }
    if run_fp16:
        dtype = 'FP16'
    else:
        dtype = amp_descriptions.get(amp_opt_level, 'FP32')

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " +
              str(distributed_parameters['rank']) + "--------")
        print("Num devices: 1")
    else:
        print("Num devices: {}".format(ngpus))
    print("Dtype: {}".format(dtype))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        # The overall numbers scale this process's measurement by world_size;
        # nothing is gathered from the other ranks.
        print("")
        print(
            "--------Overall (all ranks) (assuming same num/type devices for each rank)--------"
        )
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(world_size))
        print("Dtype: {}".format(dtype))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size /
                                                 time_per_batch))