Example #1
def prefetch_pytorch_loader(args, train=True, pin_memory=True):
    from MixedPrecision.tools.prefetcher import DataPreFetcher
    from MixedPrecision.tools.stats import StatStream
    import MixedPrecision.tools.utils as utils

    data_transforms = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
    ])

    train_dataset = TimedImageFolder(args.data, data_transforms)

    loader = torch.utils.data.DataLoader(train_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=args.workers,
                                         pin_memory=pin_memory,
                                         collate_fn=utils.timed_fast_collate)

    mean = utils.enable_half(
        torch.tensor([0.485 * 255, 0.456 * 255,
                      0.406 * 255]).float()).view(1, 3, 1, 1)
    std = utils.enable_half(
        torch.tensor([0.229 * 255, 0.224 * 255,
                      0.225 * 255]).float()).view(1, 3, 1, 1)

    return DataPreFetcher(loader,
                          mean=mean,
                          std=std,
                          cpu_stats=StatStream(drop_first_obs=10),
                          gpu_stats=StatStream(drop_first_obs=10))
Example #2
def benchmark_loader(args):
    import time
    import socket

    from MixedPrecision.tools.stats import StatStream
    from MixedPrecision.tools.prefetcher import AsyncPrefetcher
    import MixedPrecision.tools.report as report

    def ignore(x, y):
        pass

    s = time.time()

    data = load_dataset(args)

    stat = StatStream(20)
    prof = args.prof
    print('Init time was {:.4f}'.format(time.time() - s))
    print('Starting..')

    start = time.time()

    for j, (x, y) in enumerate(data):
        #x = x.cuda()
        #y = y.cuda()

        ignore(x, y)

        end = time.time()
        current_time = end - start
        stat += current_time

        if j > prof:
            break

        if stat.avg > 0:
            print('[{:4d}] {:.4f} (avg: {:.4f} img/s)'.format(
                j, args.batch_size / current_time, args.batch_size / stat.avg))

        start = time.time()

    print('Done')

    hostname = socket.gethostname()
    #current_device = torch.cuda.current_device()
    #gpu = torch.cuda.get_device_name(current_device)
    gpu = 0
    bs = args.batch_size

    common = [args.batch_size, args.workers, args.loader, hostname, gpu]
    report.print_table([
        'Metric', 'Average', 'Deviation', 'Min', 'Max', 'count', 'batch',
        'workers', 'loader', 'hostname', 'GPU'
    ], [['Load Time (s)'] + stat.to_array() + common,
        [
            'Load Speed (img/s)', bs / stat.avg, 'NA', bs / stat.max,
            bs / stat.min, stat.count
        ] + common])
Example #3
def preprocess_to_lmdb(transform, input_folder: str, output_file: str):
    train_dataset = torchvision.datasets.ImageFolder(input_folder, transform)

    n = len(train_dataset)

    print(output_file)
    env = lmdb.open(output_file, map_size=3 * 256 * 256 * n * 9)

    load_time = StatStream(10)
    save_time = StatStream(10)
    start = time.time()

    print('Converting...')

    for index, (x, y) in enumerate(train_dataset):
        end = time.time()
        load_time += end - start
        s = time.time()
        with env.begin(write=True, buffers=True) as txn:
            # convert to uint8
            x = np.array(x, dtype=np.uint8)
            x = np.moveaxis(x, -1, 0)

            #datum = array_to_datum(X, y)

            tensor_protos = caffe2_pb2.TensorProtos()
            img_tensor = tensor_protos.protos.add()
            img_tensor.dims.extend(x.shape)
            img_tensor.data_type = caffe2_pb2.TensorProto.UINT8

            flatten_img = x.reshape(np.prod(x.shape))
            img_tensor.int32_data.extend(flatten_img)

            label_protos = caffe2_pb2.TensorProtos()
            label_tensor = label_protos.protos.add()
            label_tensor.data_type = caffe2_pb2.TensorProto.INT32
            label_tensor.int32_data.append(y)

            txn.put('x_{}'.format(index).encode('ascii'),
                    tensor_protos.SerializeToString())
            txn.put('y_{}'.format(index).encode('ascii'),
                    label_protos.SerializeToString())

        e = time.time()

        save_time += e - s

        if index % 100 == 0 and load_time.avg > 0:
            print(
                '{:.4f} % Load[avg: {:.4f} img/s sd: {:.4f}] Save[avg: {:.4f} img/s sd: {:.4f}]'
                .format(index * 100 / n, 1 / load_time.avg, load_time.sd,
                        1 / save_time.avg, save_time.sd))

        start = time.time()

    env.close()
    print('{:.4f} img/s'.format(1 / load_time.avg))
Example #4
    def __init__(self, stages: List[str], drop=10):
        self.names = stages
        self.stages = [StatStream(drop) for _ in stages]

        self.current_stage = 0
        self.start_time = 0
        self.end_time = 0
        self.total_s = 0
        self.total = StatStream(drop)
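
Every example here accumulates timings in StatStream. A minimal sketch of that pattern, inferred from the usage above (the constructor argument is the number of warm-up observations to drop; avg, sd, min, max and count are the running statistics):

import time
from MixedPrecision.tools.stats import StatStream

timer = StatStream(drop_first_obs=10)

for _ in range(100):
    start = time.time()
    time.sleep(0.01)             # stand-in for the work being measured
    timer += time.time() - start

print('avg: {:.4f}s sd: {:.4f} ({} obs)'.format(timer.avg, timer.sd, timer.count))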
Example #5
def preprocess_to_hdf5(transform, input_folder: str, output_file: str):
    train_dataset = torchvision.datasets.ImageFolder(
        input_folder,
        transform)

    output = h5py.File(output_file, 'w', libver='latest')

    # >>>>>>
    # Stores an array of strings mapping index -> class name
    classes = output.create_dataset('classes', (1000,), dtype='S9')
    cls = list(train_dataset.class_to_idx.items())
    cls.sort(key=lambda x: x[1])

    for (key, index) in cls:
        classes[index] = np.string_(key)

    # <<<<<<
    n = len(train_dataset)
    hdy = output.create_dataset('label', (n,), dtype=np.uint8)
    hdx = output.create_dataset(
        'data',
        (n, 3, 256, 256),
        dtype=np.uint8,
        chunks=(1, 3, 256, 256),  # Chunk Per sample for fast retrieval
        compression='lzf')

    load_time = StatStream(10)
    save_time = StatStream(10)
    start = time.time()

    print('Converting...')

    for index, (x, y) in enumerate(train_dataset):
        end = time.time()
        load_time += end - start

        s = time.time()

        # convert to uint8
        x = np.array(x, dtype=np.uint8)

        hdy[index] = y
        hdx[index] = np.moveaxis(x, -1, 0)

        e = time.time() 

        save_time += e - s

        if index % 100 == 0 and load_time.avg > 0:
            print('{:.4f} % Load[avg: {:.4f} img/s sd: {:.4f}] Save[avg: {:.4f} img/s sd: {:.4f}]'.format(
                index * 100 / n, 1 / load_time.avg, load_time.sd, 1 / save_time.avg, save_time.sd))

        start = time.time()

    output.close()
    print('{:.4f} img/s'.format(1 / load_time.avg))
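
A sketch of reading one sample back from the file written above (the path is hypothetical; the dataset names and layout match what preprocess_to_hdf5 creates):

import h5py
import numpy as np

with h5py.File('/path/to/ImageNet.hdf5', 'r') as f:
    x = np.array(f['data'][0])        # uint8, CHW, shape (3, 256, 256)
    y = int(f['label'][0])
    name = f['classes'][y].decode()   # class names are stored as byte strings
    print(x.shape, y, name)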
Example #6
    def __init__(self,
                 root,
                 transform=None,
                 target_transform=None,
                 loader=pil_loader):
        self.zipfile = zipfile.ZipFile(root, 'r')
        self.loader = loader
        self.x_transform = transform
        self.y_transform = target_transform
        self.classes, self.classes_to_idx, self.files = self.find_classes(
            self.zipfile.namelist())

        self._read_timer = StatStream(10)
        self._transform_timer = StatStream(10)
Example #7
    def __init__(self, work, results, loader):
        super().__init__(name='AsyncPrefetcherWorker')

        self.work = work
        self.results = results
        self.loader = loader
        self.stat = StatStream(10)
Example #8
def main():
    from MixedPrecision.tools.loaders import hdf5_loader
    from MixedPrecision.tools.utils import show_args
    import argparse

    parser = argparse.ArgumentParser('Image Net Preprocessor')
    parser.add_argument('--input', type=str, help='Input directory')
    parser.add_argument('--output', type=str, help='Output directory')
    parser.add_argument('--test-only', action='store_true', default=False,
                        help='Do not run the preprocessor')
    parser.add_argument('--speed-test', action='store_true', default=False,
                        help='Run the speed test on the created dataset')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Batch size to use for the speed test')
    parser.add_argument('--workers', type=int, default=4,
                        help='Number of workers to use for the speed test')

    t = transforms.Compose([
        transforms.Resize((256, 256))
    ])

    args = parser.parse_args()
    show_args(args)

    if not args.test_only:
        s = time.time()
        preprocess_to_hdf5(t, args.input, args.output)
        e = time.time()
        print('Preprocessed Dataset in {:.4f} min'.format((e - s) / 60))

    if args.speed_test:
        print('Speed test')

        # Create a new args that is usable by our data loader
        args = argparse.Namespace(
            data=args.output,
            workers=args.workers,
            batch_size=args.batch_size
        )
        loader = hdf5_loader(args)
        print(' - {} images available'.format(len(loader.dataset)))

        load = StatStream(20)
        start = time.time()

        for index, (x, y) in enumerate(loader):
            end = time.time()
            ignore(x, y)
            load += end - start

            if index > 100:
                break

            start = time.time()

        print(' - {:.4f} img/sec (min={:.4f}, max={:.4f})'.format(
            args.batch_size / load.avg, args.batch_size / load.max, args.batch_size / load.min))

        print(' - {:.4f} sec (min={:.4f}, max={:.4f}, sd={:.4f})'.format(
            load.avg, load.min, load.max, load.sd))
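
A hypothetical invocation of this entry point; the script name and paths are placeholders, and the flags are the ones the parser defines above:

import sys

sys.argv = ['preprocess.py', '--input', '/data/imagenet/train',
            '--output', '/data/ImageNet.hdf5']
main()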
Example #9
def train(args, dataset):
    shape = args.shape
    nbatch = args.batch_size
    nin = shape[0] * shape[1] * shape[2]
    conv_num = args.conv_num
    learning_rate = args.lr
    momentum = args.momentum
    loss_scale = args.static_loss_scale
    dtype = tf.float16 if args.half else tf.float32

    tf.set_random_seed(0)
    np.random.seed(0)

    device = '/gpu:0' if args.gpu else '/cpu'

    # Create training graph
    # ------------------------------------------------------------------------------------------------------------------
    with tf.device(device), \
         tf.variable_scope(
             # Note: This forces trainable variables to be stored as float32
             'fp32_storage', custom_getter=float32_variable_storage_getter):

        data, target, loss = create_simple_model(nbatch, shape, conv_num, 512, 10, dtype)

        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        # Note: Loss scaling can improve numerical stability for fp16 training
        grads = gradients_with_loss_scaling(loss, variables, loss_scale)

        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)

        training_step_op = optimizer.apply_gradients(zip(grads, variables))

        init_op = tf.global_variables_initializer()
    # ------------------------------------------------------------------------------------------------------------------

    # Run training
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=args.log_device_placement))
    sess.run(init_op)

    compute_time = StatStream(1)
    floss = float('inf')

    for epoch in range(0, args.epochs):
        cstart = time.time()

        for batch in dataset:
            x, y = batch

            floss, _ = sess.run([loss, training_step_op], feed_dict={data: x, target: y})

        cend = time.time()
        compute_time += cend - cstart

        print('[{:4d}] Compute Time (avg: {:.4f}, sd: {:.4f}) Loss: {:.4f}'.format(
            1 + epoch, compute_time.avg, compute_time.sd, floss))
Example #10
def train(args, model, data):
    import time

    import MixedPrecision.tools.utils as utils
    from MixedPrecision.tools.optimizer import OptimizerAdapter
    from MixedPrecision.tools.stats import StatStream

    model = utils.enable_cuda(model)
    model = utils.enable_half(model)

    criterion = utils.enable_cuda(nn.CrossEntropyLoss())
    criterion = utils.enable_half(criterion)

    optimizer = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )
    optimizer = OptimizerAdapter(
        optimizer,
        static_loss_scale=args.static_loss_scale,
        dynamic_loss_scale=args.dynamic_loss_scale
    )

    model.train()

    compute_time = StatStream(1)
    floss = float('inf')

    for epoch in range(0, args.epochs):
        cstart = time.time()

        for batch in data:
            x, y = batch

            x = utils.enable_cuda(x)
            y = utils.enable_cuda(y)

            x = utils.enable_half(x)
            out = model(x)

            loss = criterion(out, y)

            floss = loss.item()

            optimizer.zero_grad()
            optimizer.backward(loss)
            optimizer.step()

        cend = time.time()
        compute_time += cend - cstart

        print('[{:4d}] Compute Time (avg: {:.4f}, sd: {:.4f}) Loss: {:.4f}'.format(
            1 + epoch, compute_time.avg, compute_time.sd, floss))
Example #11
    def __init__(self, loader, buffering=2):
        self.loader = iter(loader)
        self.data = None
        self.loading_stat = StatStream(1)
        self.wait_time = StatStream(1)
        #self.manager = Manager()
        #self.work_queue = self.manager.Queue()
        #self.result_queue = self.manager.Queue()

        self.work_queue = multiprocessing.SimpleQueue()
        self.result_queue = multiprocessing.SimpleQueue()

        # MP implementation
        #self.worker = multiprocessing.Process(target=prefetch, args=(self.work_queue, self.result_queue, self.loader, self.loading_stat))
        #self.worker.start()

        self.worker = AsyncPrefetcherWorker(self.work_queue, self.result_queue,
                                            self.loader)
        self.worker.start()

        # put n batches in advance
        for i in range(buffering):
            self.work_queue.put('next')
Example #12
    def __init__(self,
                 loader,
                 mean,
                 std,
                 cpu_stats=StatStream(0),
                 gpu_stats=StatStream(0)):
        print('Using Prefetcher')
        self.loader = iter(loader)
        self.mean = mean
        self.std = std
        self.next_target = None
        self.next_input = None
        self.stream = self._make_stream()

        self.start_event = torch.cuda.Event(enable_timing=True,
                                            blocking=False,
                                            interprocess=False)
        self.end_event = torch.cuda.Event(enable_timing=True,
                                          blocking=False,
                                          interprocess=False)

        self.gpu_time = gpu_stats
        self.cpu_time = cpu_stats
        self.preload()
Example #13
class MultiStageChrono:
    def __init__(self, stages: List[str], drop=10):
        self.names = stages
        self.stages = [StatStream(drop) for _ in stages]

        self.current_stage = 0
        self.start_time = 0
        self.end_time = 0
        self.total_s = 0
        self.total = StatStream(drop)

    def start(self):
        if self.start_time == 0:
            self.start_time = time.time()
            self.total_s = self.start_time
        else:
            self.end_time = time.time()
            self.stages[self.current_stage] += (self.end_time -
                                                self.start_time)
            self.start_time = self.end_time
            self.current_stage += 1

    def end(self):
        self.end_time = time.time()
        self.stages[self.current_stage] += (self.end_time - self.start_time)
        self.total += self.end_time - self.total_s
        self.current_stage = 0
        self.start_time = 0

    def make_table(self, common: List = None, transform=None):
        common = common or []
        table = []

        for i, stream in enumerate(self.stages):
            table.append([self.names[i]] + stream.to_array(transform) + common)

        table.append(['Total'] + self.total.to_array(transform) + common)
        return table

    def report(self, speed=False, size=1):
        import MixedPrecision.tools.report as report

        header = ['Stage', 'Average', 'Deviation', 'Min', 'Max', 'count']
        table = self.make_table(
            None, lambda x: size / x) if speed else self.make_table()
        report.print_table(header, table)
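
A minimal usage sketch of MultiStageChrono, assuming the driving pattern implied by start()/end(): the first start() opens the first stage, each subsequent start() closes the current stage and opens the next, and end() closes the last stage and the per-iteration total. The time.sleep calls stand in for the work of each stage.

import time

chrono = MultiStageChrono(['load', 'forward', 'backward'], drop=0)

for _ in range(20):
    chrono.start()          # opens 'load'
    time.sleep(0.001)       # stand-in for loading a batch
    chrono.start()          # closes 'load', opens 'forward'
    time.sleep(0.001)       # stand-in for the forward pass
    chrono.start()          # closes 'forward', opens 'backward'
    time.sleep(0.001)       # stand-in for the backward pass
    chrono.end()            # closes 'backward' and adds to the iteration total

chrono.report()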
Example #14
    def __init__(self, loop_interval, device_id):
        self.options = [
            '--format=csv', '--loop-ms=' + str(loop_interval),
            '--id=' + str(device_id)
        ]
        self.streams = [StatStream(drop_first_obs=2) for _ in metrics]
        self.n = len(metrics)
        self.process = None
        self.running = True
        self.dispatcher = {
            'name': self.process_ignore,
            'temperature.gpu': self.process_value,
            'utilization.gpu': self.process_percentage,
            'utilization.memory': self.process_percentage,
            'memory.total': self.process_memory,
            'memory.free': self.process_memory,
            'memory.used': self.process_memory
        }
Example #15
def preprocess(transform, input_folder, output_folder):
    train_dataset = torchvision.datasets.ImageFolder(input_folder, transform)

    load_time = StatStream(10)

    start = time.time()
    for index, (x, y) in enumerate(train_dataset):
        end = time.time()
        load_time += end - start

        class_name = train_dataset.classes[y]
        output_dir = '{}/{}'.format(output_folder, class_name)
        os.makedirs(output_dir, mode=0o755, exist_ok=True)

        out = '{}/{}_{}.jpeg'.format(output_dir, class_name, index)
        x.save(out, 'JPEG')

        start = time.time()

    print('avg: {:.4f}s sd: {:.4f} {}'.format(load_time.avg, load_time.sd,
                                              load_time.count))
    print('{:.4f} img/s'.format(1 / load_time.avg))
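
A hypothetical call, reusing the Resize transform from Example #8 (the paths are placeholders):

from torchvision import transforms

t = transforms.Compose([transforms.Resize((256, 256))])
preprocess(t, '/data/imagenet/train', '/data/imagenet_256/train')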
Example #16
def train(args, model, dataset, name, is_warmup=False):
    import time

    import MixedPrecision.tools.utils as utils
    from MixedPrecision.tools.optimizer import OptimizerAdapter
    from MixedPrecision.tools.stats import StatStream
    from MixedPrecision.tools.monitor import make_monitor

    model = utils.enable_cuda(model)

    if args.half:
        from apex.fp16_utils import network_to_half
        model = network_to_half(model)

    criterion = utils.enable_cuda(nn.CrossEntropyLoss())
    # No Half precision for the criterion
    # criterion = utils.enable_half(criterion)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    optimizer = OptimizerAdapter(optimizer,
                                 half=args.half,
                                 static_loss_scale=args.static_loss_scale,
                                 dynamic_loss_scale=args.dynamic_loss_scale)
    model.train()

    epoch_compute = StatStream(drop_first_obs=10)
    batch_compute = StatStream(drop_first_obs=10)
    gpu_compute = StatStream(drop_first_obs=10)
    compute_speed = StatStream(drop_first_obs=10)
    effective_speed = StatStream(drop_first_obs=10)
    data_waiting = StatStream(drop_first_obs=10)
    data_loading_gpu = StatStream(drop_first_obs=10)
    data_loading_cpu = StatStream(drop_first_obs=10)
    full_time = StatStream(drop_first_obs=10)
    iowait = StatStream(drop_first_obs=10)
    transfert_time = StatStream(drop_first_obs=10)

    start_event = torch.cuda.Event(enable_timing=True,
                                   blocking=False,
                                   interprocess=False)
    end_event = torch.cuda.Event(enable_timing=True,
                                 blocking=False,
                                 interprocess=False)

    floss = float('inf')

    # Stop after n prints when benchmarking, i.e. after (n * batch_count) batches
    print_count = 0
    monitor_proc, gpu_monitor = make_monitor(loop_interval=250)

    def should_run():
        if args.prof is None:
            return True
        return print_count < args.prof

    try:
        for epoch in range(0, args.epochs):
            epoch_compute_start = time.time()

            # Looks like it only computes for the current process and not its children
            data_time_start = time.time()

            batch_count = 0
            effective_batch = 0

            for index, (x, y) in enumerate(dataset):
                transfert_start = time.time()
                x = x.cuda()
                y = y.cuda().long()
                torch.cuda.synchronize()

                data_time_end = time.time()
                transfert_time += (data_time_end - transfert_start)
                data_waiting += (data_time_end - data_time_start)

                # compute output
                batch_compute_start = time.time()

                output = model(x)
                loss = criterion(output, y)
                floss = loss.item()

                # compute gradient and do SGD step
                optimizer.zero_grad()
                optimizer.backward(loss)
                optimizer.step()

                #print(floss)
                torch.cuda.synchronize()
                batch_compute_end = time.time()
                full_time += batch_compute_end - data_time_start
                batch_compute += batch_compute_end - batch_compute_start

                compute_speed += args.batch_size / (batch_compute_end -
                                                    batch_compute_start)
                effective_speed += args.batch_size / (batch_compute_end -
                                                      data_time_start)

                effective_batch += 1

                data_time_start = time.time()

                batch_count += 1

                if effective_batch % 10 == 0:
                    print_count += 1
                    speed_avg = args.batch_size / batch_compute.avg

                    print(
                        '[{:4d}][{:4d}] '
                        'Batch Time (avg: {batch_compute.avg:.4f}, sd: {batch_compute.sd:.4f}) '
                        'Speed (avg: {speed:.4f}) '
                        'Data (avg: {data_waiting.avg:.4f}, sd: {data_waiting.sd:.4f})'
                        .format(1 + epoch,
                                batch_count,
                                batch_compute=batch_compute,
                                speed=speed_avg,
                                data_waiting=data_waiting))

                epoch_compute_end = time.time()
                epoch_compute.update(epoch_compute_end - epoch_compute_start)

                if not should_run():
                    break
            if not should_run():
                break
    finally:
        print('Done')
        gpu_monitor.stop()
        monitor_proc.terminate()

    if not is_warmup:
        hostname = socket.gethostname()
        current_device = torch.cuda.current_device()
        gpu = torch.cuda.get_device_name(current_device)
        gpu = gpu[0:min(10, len(gpu))].strip()

        bs = args.batch_size
        loader = args.loader

        header = [
            'Metric', 'Average', 'Deviation', 'Min', 'Max', 'count', 'half',
            'batch', 'workers', 'loader', 'model', 'hostname', 'GPU'
        ]
        common = [
            args.half, args.batch_size, args.workers, loader, name, hostname,
            gpu
        ]

        report_data = [
            ['Waiting for data (s)'] + data_waiting.to_array() + common,
            ['GPU Compute Time (s)'] + batch_compute.to_array() + common,
            ['Full Batch Time (s)'] + full_time.to_array() + common,
            [
                'Compute Speed (img/s)', bs / batch_compute.avg, 'NA', bs /
                batch_compute.max, bs / batch_compute.min, batch_compute.count
            ] + common,
            [
                'Effective Speed (img/s)', bs / full_time.avg, 'NA',
                bs / full_time.max, bs / full_time.min, batch_compute.count
            ] + common,
            # Ignored Metric
            #  GPU timed on the CPU side (very close to GPU timing anyway)
            # # ['CPU Compute Time (s)] + batch_compute.to_array() + common,

            #  https://en.wikipedia.org/wiki/Harmonic_mean
            # ['Compute Inst Speed (img/s)'] + compute_speed.to_array() + common,
            # ['Effective Inst Speed (img/s)'] + effective_speed.to_array() + common,

            # ['iowait'] + iowait.to_array() + common
        ]

        # Only some loaders support this
        # So try and print an error but do not fail
        try:
            data_reading = dataset.dataset.read_timer
            data_transform = dataset.dataset.transform_timer
            collate_time = utils.timed_fast_collate.time_stream

            if data_loading_cpu.count > 1:
                report_data += [['Prefetch CPU Data loading (s)'] +
                                data_loading_cpu.to_array() + common]
                report_data += [['Prefetch GPU Data Loading (s)'] +
                                data_loading_gpu.to_array() + common]

            report_data += [['Read Time (s)'] + data_reading.to_array() +
                            common]
            report_data += [['Transform Time (s)'] +
                            data_transform.to_array() + common]
            report_data += [[
                'Read Speed per process (img/s)', 1.0 / data_reading.avg, 'NA',
                1.0 / data_reading.max, 1.0 / data_reading.min,
                data_reading.count
            ] + common]
            report_data += [[
                'Transform Speed per process (img/s)',
                1.0 / data_transform.avg, 'NA', 1.0 / data_transform.max,
                1.0 / data_transform.min, data_transform.count
            ] + common]

            report_data += [[
                'Read Speed (img/s)', args.workers / data_reading.avg, 'NA',
                args.workers / data_reading.max,
                args.workers / data_reading.min, data_reading.count
            ] + common]
            report_data += [[
                'Transform Speed (img/s)', args.workers / data_transform.avg,
                'NA', args.workers / data_transform.max,
                args.workers / data_transform.min, data_transform.count
            ] + common]
            report_data += [[
                'Image Aggregation Speed (img/s)', bs / collate_time.avg, 'NA',
                bs / collate_time.max, bs / collate_time.min,
                collate_time.count
            ] + common]
            report_data += [[
                'Image Aggregation Time (s)', collate_time.avg,
                collate_time.sd, collate_time.max, collate_time.min,
                collate_time.count
            ] + common]
        except Exception as e:
            print(e)

        report_data.extend(gpu_monitor.arrays(common))
        report.print_table(header, report_data, filename=args.report)

    return
Example #17
    def __init__(self, collate, time_stream=StatStream(10)):
        self.collate = collate
        self.time_stream = time_stream
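
Only the constructor is shown here. From Examples #1 and #16 this wrapper is passed to DataLoader as collate_fn and its timings are read back through time_stream, so the call path presumably looks like the hedged sketch below (TimedCollate and its __call__ are assumptions, not the package's actual code):

import time
from MixedPrecision.tools.stats import StatStream

class TimedCollate:
    def __init__(self, collate, time_stream=StatStream(10)):
        self.collate = collate
        self.time_stream = time_stream

    def __call__(self, batch):
        # Forward to the wrapped collate function and record how long it took.
        start = time.time()
        out = self.collate(batch)
        self.time_stream += time.time() - start
        return out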
Example #18
    from MixedPrecision.tools.utils import show_args
    import argparse

    #main()

    print('Batch Size,	Workers,	Average (s),	SD (s),	Min (s),	Max (s),	Count')
    for w in (0, 1, 2, 4, 8):
        for b in (32, 64, 128, 256):
            # Create a new args that is usable by our data loader
            args = argparse.Namespace(
                data='/home/user1/test_database/imgnet/ImageNet.hdf5',
                workers=w,
                batch_size=b
            )
            loader = hdf5_loader(args)
            load = StatStream(20)
            start = time.time()

            for index, (x, y) in enumerate(loader):
                end = time.time()
                ignore(x, y)
                load += end - start

                if index > 100:
                    break

                start = time.time()

            del loader
            print('{}, {}, {}, {}, {}, {}, {}'.format(b, w, load.avg, load.sd, load.min, load.max, load.count))