def prefetch_pytorch_loader(args, train=True, pin_memory=True):
    """Build a prefetching ImageNet-style training loader.

    Wraps a ``TimedImageFolder`` in a standard ``DataLoader`` and hands it to
    ``DataPreFetcher`` together with ImageNet channel statistics scaled to the
    [0, 255] pixel range.
    """
    from MixedPrecision.tools.prefetcher import DataPreFetcher
    from MixedPrecision.tools.stats import StatStream
    import MixedPrecision.tools.utils as utils

    pipeline = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
    ])
    dataset = TimedImageFolder(args.data, pipeline)

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin_memory,
        collate_fn=utils.timed_fast_collate)

    # Per-channel stats shaped (1, 3, 1, 1) for broadcasting over NCHW batches
    def channel_stats(values):
        return utils.enable_half(torch.tensor(values).float()).view(1, 3, 1, 1)

    mean = channel_stats([0.485 * 255, 0.456 * 255, 0.406 * 255])
    std = channel_stats([0.229 * 255, 0.224 * 255, 0.225 * 255])

    return DataPreFetcher(
        loader,
        mean=mean, std=std,
        cpu_stats=StatStream(drop_first_obs=10),
        gpu_stats=StatStream(drop_first_obs=10))
def __init__(self, stages: List[str], drop=10):
    """Track timing statistics for a sequence of named stages.

    One StatStream is kept per stage plus one aggregate stream; the first
    *drop* observations of each are discarded as warm-up.
    """
    self.names = stages
    self.stages = [StatStream(drop) for _stage in stages]
    self.total = StatStream(drop)
    # Bookkeeping for the stage currently being timed
    self.current_stage = 0
    self.start_time = 0
    self.end_time = 0
    self.total_s = 0
def preprocess_to_lmdb(transform, input_folder: str, output_file: str):
    """Convert an ImageFolder dataset into an LMDB database of caffe2 TensorProtos.

    Each sample is stored under two keys: ``x_{i}`` (a uint8 CHW image tensor)
    and ``y_{i}`` (an int32 label). Prints load/save throughput while running.
    """
    train_dataset = torchvision.datasets.ImageFolder(input_folder, transform)
    n = len(train_dataset)
    print(output_file)

    # Generous map_size upper bound: n uint8 images of 3x256x256, with slack
    env = lmdb.open(output_file, map_size=3 * 256 * 256 * n * 9)

    load_time = StatStream(10)
    save_time = StatStream(10)
    start = time.time()

    print('Converting...')
    for index, (x, y) in enumerate(train_dataset):
        end = time.time()
        load_time += end - start
        s = time.time()

        with env.begin(write=True, buffers=True) as txn:
            # convert to uint8, HWC -> CHW
            x = np.array(x, dtype=np.uint8)
            x = np.moveaxis(x, -1, 0)

            tensor_protos = caffe2_pb2.TensorProtos()
            img_tensor = tensor_protos.protos.add()
            img_tensor.dims.extend(x.shape)
            img_tensor.data_type = caffe2_pb2.TensorProto.UINT8
            flatten_img = x.reshape(np.prod(x.shape))
            img_tensor.int32_data.extend(flatten_img)

            label_protos = caffe2_pb2.TensorProtos()
            # BUG FIX: the label tensor was previously added to `tensor_protos`
            # instead of `label_protos`, so every `y_{i}` entry serialized an
            # EMPTY TensorProtos and the label was silently dropped.
            label_tensor = label_protos.protos.add()
            label_tensor.data_type = caffe2_pb2.TensorProto.INT32
            label_tensor.int32_data.append(y)

            txn.put('x_{}'.format(index).encode('ascii'), tensor_protos.SerializeToString())
            txn.put('y_{}'.format(index).encode('ascii'), label_protos.SerializeToString())

        e = time.time()
        save_time += e - s

        if index % 100 == 0 and load_time.avg > 0:
            print(
                '{:.4f} % Load[avg: {:.4f} img/s sd: {:.4f}] Save[avg: {:.4f} img/s sd: {:.4f}]'
                .format(index * 100 / n, 1 / load_time.avg, load_time.sd, 1 / save_time.avg, save_time.sd))

        start = time.time()

    env.close()
    print('{:.4f} img/s'.format(1 / load_time.avg))
def preprocess_to_hdf5(transform, input_folder: str, output_file: str):
    """Convert an ImageFolder dataset into a single HDF5 file.

    Layout of the output file:
      * ``classes``: 1000 fixed-width strings mapping class index -> class name
      * ``label``:   (n,) uint8 labels
      * ``data``:    (n, 3, 256, 256) uint8 images, one chunk per sample

    Prints load/save throughput while converting.
    """
    train_dataset = torchvision.datasets.ImageFolder(
        input_folder, transform)

    output = h5py.File(output_file, 'w', libver='latest')

    # >>>>>>
    # Stores an Array of String representing Index -> class
    # NOTE(review): dataset is hard-coded to 1000 classes and 9-byte names
    # ('S9') — assumes ImageNet-style synset ids; confirm for other datasets.
    classes = output.create_dataset('classes', (1000,), dtype='S9')

    cls = list(train_dataset.class_to_idx.items())
    cls.sort(key=lambda x: x[1])  # order entries by class index

    for (key, index) in cls:
        classes[index] = np.string_(key)
    # <<<<<<

    n = len(train_dataset)

    hdy = output.create_dataset('label', (n,), dtype=np.uint8)
    hdx = output.create_dataset(
        'data', (n, 3, 256, 256),
        dtype=np.uint8,
        chunks=(1, 3, 256, 256),  # Chunk Per sample for fast retrieval
        compression='lzf')

    # Timers dropping the first 10 observations as warm-up
    load_time = StatStream(10)
    save_time = StatStream(10)
    start = time.time()

    print('Converting...')
    for index, (x, y) in enumerate(train_dataset):
        end = time.time()
        load_time += end - start
        s = time.time()

        # convert to uint8 (and HWC -> CHW below)
        x = np.array(x, dtype=np.uint8)
        hdy[index] = y
        hdx[index] = np.moveaxis(x, -1, 0)

        e = time.time()
        save_time += e - s

        # Progress report every 100 images, once timers have warmed up
        if index % 100 == 0 and load_time.avg > 0:
            print('{:.4f} % Load[avg: {:.4f} img/s sd: {:.4f}] Save[avg: {:.4f} img/s sd: {:.4f}]'.format(
                index * 100 / n, 1 / load_time.avg, load_time.sd, 1 / save_time.avg, save_time.sd))

        start = time.time()

    output.close()
    print('{:.4f} img/s'.format(1 / load_time.avg))
def __init__(self, root, transform=None, target_transform=None, loader=pil_loader):
    """Dataset that reads images directly out of a zip archive at *root*."""
    self.zipfile = zipfile.ZipFile(root, 'r')
    self.loader = loader
    self.x_transform = transform
    self.y_transform = target_transform

    # Derive the class list and per-class files from the archive contents
    entries = self.zipfile.namelist()
    self.classes, self.classes_to_idx, self.files = self.find_classes(entries)

    # Timers for read/transform phases (first 10 observations dropped)
    self._read_timer = StatStream(10)
    self._transform_timer = StatStream(10)
def __init__(self, work, results, loader):
    """Background worker that fetches batches from *loader* on request."""
    super().__init__(name='AsyncPrefetcherWorker')
    # Queues shared with the parent prefetcher: requests in, batches out
    self.work = work
    self.results = results
    self.loader = loader
    # Fetch timing; first 10 observations dropped as warm-up
    self.stat = StatStream(10)
def main():
    """CLI entry point: preprocess an image dataset to HDF5, optionally benchmark it.

    ``--test-only`` skips preprocessing; ``--speed-test`` times reading batches
    back through ``hdf5_loader``.
    """
    from MixedPrecision.tools.loaders import hdf5_loader
    from MixedPrecision.tools.utils import show_args
    import argparse

    parser = argparse.ArgumentParser('Image Net Preprocessor')
    parser.add_argument('--input', type=str, help='Input directory')
    parser.add_argument('--output', type=str, help='Output directory')
    parser.add_argument('--test-only', action='store_true', default=False,
                        help='Do not run the preprocessor')
    parser.add_argument('--speed-test', action='store_true', default=False,
                        help='Run the speed test on the created dataset')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Batch size to use for the speed test')
    parser.add_argument('--workers', type=int, default=4,
                        help='Number of worker to use for the speed test')

    t = transforms.Compose([
        transforms.Resize((256, 256))
    ])

    args = parser.parse_args()
    show_args(args)

    if not args.test_only:
        s = time.time()
        preprocess_to_hdf5(t, args.input, args.output)
        e = time.time()
        print('Preprocessed Dataset in {:.4f} min'.format((e - s) / 60))

    if args.speed_test:
        print('Speed test')

        # BUG FIX: `ignore` was referenced below but never defined or imported
        # in this scope, raising NameError as soon as the speed test ran.
        # Defined locally (harmlessly shadows any module-level helper).
        def ignore(x, y):
            pass

        # Create a new args that is usable by our data loader
        args = argparse.Namespace(
            data=args.output,
            workers=args.workers,
            batch_size=args.batch_size
        )

        loader = hdf5_loader(args)
        print(' - {} images available'.format(len(loader.dataset)))

        load = StatStream(20)  # drop first 20 observations as warm-up
        start = time.time()
        for index, (x, y) in enumerate(loader):
            end = time.time()
            ignore(x, y)
            load += end - start
            if index > 100:
                break
            start = time.time()

        print(' - {:.4f} img/sec (min={:.4f}, max={:.4f})'.format(
            args.batch_size / load.avg, args.batch_size / load.max, args.batch_size / load.min))
        print(' - {:.4f} sec (min={:.4f}, max={:.4f}, sd={:.4f})'.format(
            load.avg, load.min, load.max, load.sd))
def benchmark_loader(args):
    """Benchmark the data loader selected by *args* and print a report table.

    Iterates the loader without any GPU work (batches are discarded), timing
    each batch, and stops after ``args.prof`` batches.
    """
    import time
    import socket
    from MixedPrecision.tools.stats import StatStream
    from MixedPrecision.tools.prefetcher import AsyncPrefetcher
    import MixedPrecision.tools.report as report

    # Sink for the batch so the fetch cannot be optimized away / is consumed
    def ignore(x, y):
        pass

    s = time.time()
    data = load_dataset(args)
    stat = StatStream(20)  # drop first 20 observations as warm-up
    prof = args.prof

    print('Init time was {:.4f}'.format(time.time() - s))
    print('Starting..')

    start = time.time()
    for j, (x, y) in enumerate(data):
        # GPU transfer deliberately disabled: we only measure loading
        #x = x.cuda()
        #y = y.cuda()
        ignore(x, y)

        end = time.time()
        current_time = end - start
        stat += current_time

        if j > prof:
            break

        if stat.avg > 0:
            print('[{:4d}] {:.4f} (avg: {:.4f} img/s)'.format(
                j, args.batch_size / current_time, args.batch_size / stat.avg))

        start = time.time()

    print('Done')

    hostname = socket.gethostname()
    # GPU lookup disabled since no CUDA work happens in this benchmark
    #current_device = torch.cuda.current_device()
    #gpu = torch.cuda.get_device_name(current_device)
    gpu = 0
    bs = args.batch_size

    # Columns shared by every row of the report
    common = [args.batch_size, args.workers, args.loader, hostname, gpu]
    report.print_table(
        [
            'Metric', 'Average', 'Deviation', 'Min', 'Max', 'count', 'batch',
            'workers', 'loader', 'hostname', 'GPU'
        ],
        [['Load Time (s)'] + stat.to_array() + common,
         [
             'Load Speed (img/s)', bs / stat.avg, 'NA', bs / stat.max,
             bs / stat.min, stat.count
         ] + common])
def train(args, dataset):
    """Train a small conv model with TensorFlow 1.x graph mode, optionally in fp16.

    Builds the graph with float32 master weights (via the custom variable
    getter) and scaled gradients for numerical stability, then runs
    ``args.epochs`` epochs over *dataset*, printing per-epoch compute time and
    the last loss.
    """
    shape = args.shape
    nbatch = args.batch_size
    nin = shape[0] * shape[1] * shape[2]
    conv_num = args.conv_num
    learning_rate = args.lr
    momentum = args.momentum
    loss_scale = args.static_loss_scale

    # Compute dtype only; trainable variables stay fp32 (see variable_scope)
    dtype = tf.float16 if args.half else tf.float32

    tf.set_random_seed(0)
    np.random.seed(0)

    device = '/gpu:0' if args.gpu else '/cpu'

    # Create training graph
    # ------------------------------------------------------------------------------------------------------------------
    with tf.device(device), \
        tf.variable_scope(
            # Note: This forces trainable variables to be stored as float32
            'fp32_storage', custom_getter=float32_variable_storage_getter):
        data, target, loss = create_simple_model(nbatch, shape, conv_num, 512, 10, dtype)
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        # Note: Loss scaling can improve numerical stability for fp16 training
        grads = gradients_with_loss_scaling(loss, variables, loss_scale)

        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        training_step_op = optimizer.apply_gradients(zip(grads, variables))
        init_op = tf.global_variables_initializer()
    # ------------------------------------------------------------------------------------------------------------------

    # Run training
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=args.log_device_placement))
    sess.run(init_op)

    compute_time = StatStream(1)  # drop the first epoch (graph warm-up)
    floss = float('inf')

    for epoch in range(0, args.epochs):
        cstart = time.time()

        for batch in dataset:
            x, y = batch
            floss, _ = sess.run([loss, training_step_op], feed_dict={data: x, target: y})

        cend = time.time()
        compute_time += cend - cstart

        print('[{:4d}] Compute Time (avg: {:.4f}, sd: {:.4f}) Loss: {:.4f}'.format(
            1 + epoch, compute_time.avg, compute_time.sd, floss))
def train(args, model, data):
    """Mixed-precision training loop: SGD with loss scaling via OptimizerAdapter.

    Runs ``args.epochs`` epochs over *data*, printing the average per-epoch
    compute time and the loss of the last batch after each epoch.
    """
    import time
    import MixedPrecision.tools.utils as utils
    from MixedPrecision.tools.optimizer import OptimizerAdapter
    from MixedPrecision.tools.stats import StatStream

    # Move model and criterion to the configured device / precision
    model = utils.enable_half(utils.enable_cuda(model))
    criterion = utils.enable_half(utils.enable_cuda(nn.CrossEntropyLoss()))

    sgd = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )
    # The adapter handles loss scaling for fp16 training
    optimizer = OptimizerAdapter(
        sgd,
        static_loss_scale=args.static_loss_scale,
        dynamic_loss_scale=args.dynamic_loss_scale
    )

    model.train()

    epoch_timer = StatStream(1)  # first epoch dropped as warm-up
    last_loss = float('inf')

    for epoch in range(args.epochs):
        epoch_start = time.time()

        for x, y in data:
            x = utils.enable_half(utils.enable_cuda(x))
            y = utils.enable_cuda(y)

            prediction = model(x)
            loss = criterion(prediction, y)
            last_loss = loss.item()

            optimizer.zero_grad()
            optimizer.backward(loss)
            optimizer.step()

        epoch_timer += time.time() - epoch_start
        print('[{:4d}] Compute Time (avg: {:.4f}, sd: {:.4f}) Loss: {:.4f}'.format(
            1 + epoch, epoch_timer.avg, epoch_timer.sd, last_loss))
def __init__(self, loader, buffering=2):
    """Asynchronous prefetcher: a worker thread pulls batches ahead of the consumer.

    *buffering* request tokens are queued up-front so the worker stays
    *buffering* batches ahead of the consumer.
    """
    self.loader = iter(loader)
    self.data = None
    self.loading_stat = StatStream(1)
    self.wait_time = StatStream(1)

    # Earlier Manager-based queue implementation, kept for reference
    #self.manager = Manager()
    #self.work_queue = self.manager.Queue()
    #self.result_queue = self.manager.Queue()
    self.work_queue = multiprocessing.SimpleQueue()
    self.result_queue = multiprocessing.SimpleQueue()

    # MP implementation
    #self.worker = multiprocessing.Process(target=prefetch, args=(self.work_queue, self.result_queue, self.loader, self.loading_stat))
    #self.worker.start()

    # NOTE(review): despite the multiprocessing queues, the worker is the
    # AsyncPrefetcherWorker thread/process class defined elsewhere in this file
    self.worker = AsyncPrefetcherWorker(self.work_queue, self.result_queue, self.loader)
    self.worker.start()

    # put n batch in advance
    for i in range(buffering):
        self.work_queue.put('next')
def __init__(self, loader, mean, std, cpu_stats=None, gpu_stats=None):
    """CUDA-stream prefetcher that overlaps host->device copies with compute.

    Args:
        loader: iterable of (input, target) batches.
        mean, std: broadcastable tensors used by the normalization step.
        cpu_stats, gpu_stats: optional StatStream collectors; fresh ones are
            created when omitted.
    """
    # BUG FIX: the defaults used to be `cpu_stats=StatStream(0)` /
    # `gpu_stats=StatStream(0)` — mutable default arguments evaluated once at
    # definition time, so every prefetcher created without explicit streams
    # SHARED the same StatStream objects and mixed their measurements.
    print('Using Prefetcher')
    self.loader = iter(loader)
    self.mean = mean
    self.std = std
    self.next_target = None
    self.next_input = None
    self.stream = self._make_stream()
    # Events for timing GPU-side work
    self.start_event = torch.cuda.Event(enable_timing=True, blocking=False, interprocess=False)
    self.end_event = torch.cuda.Event(enable_timing=True, blocking=False, interprocess=False)
    self.gpu_time = gpu_stats if gpu_stats is not None else StatStream(0)
    self.cpu_time = cpu_stats if cpu_stats is not None else StatStream(0)
    # Kick off the first asynchronous load
    self.preload()
def __init__(self, loop_interval, device_id):
    """Prepare nvidia-smi polling options and per-metric statistic streams."""
    # Command-line flags for the monitoring subprocess
    self.options = [
        '--format=csv',
        '--loop-ms=' + str(loop_interval),
        '--id=' + str(device_id)
    ]

    # One stream per monitored metric; first 2 observations dropped as warm-up
    self.streams = [StatStream(drop_first_obs=2) for _metric in metrics]
    self.n = len(metrics)

    self.process = None
    self.running = True

    # Maps a CSV column name to the parser used for its values
    self.dispatcher = {
        'name': self.process_ignore,
        'temperature.gpu': self.process_value,
        'utilization.gpu': self.process_percentage,
        'utilization.memory': self.process_percentage,
        'memory.total': self.process_memory,
        'memory.free': self.process_memory,
        'memory.used': self.process_memory
    }
def preprocess(transform, input_folder, output_folder):
    """Re-encode an ImageFolder dataset as JPEGs under per-class directories.

    Each sample is written to ``{output_folder}/{class}/{class}_{index}.jpeg``.
    Prints loading statistics when done.
    """
    train_dataset = torchvision.datasets.ImageFolder(input_folder, transform)

    load_time = StatStream(10)  # drop first 10 observations as warm-up
    start = time.time()

    for index, (x, y) in enumerate(train_dataset):
        end = time.time()
        load_time += end - start

        class_name = train_dataset.classes[y]
        output_dir = '{}/{}'.format(output_folder, class_name)
        os.makedirs(output_dir, mode=0o755, exist_ok=True)

        out = '{}/{}_{}.jpeg'.format(output_dir, class_name, index)
        x.save(out, 'JPEG')

        start = time.time()

    print('avg: {:.4f}s sd: {:.4f} {}'.format(load_time.avg, load_time.sd, load_time.count))
    # BUG FIX: guard the division like the other converters do — with a small
    # dataset (fewer samples than the dropped warm-up observations) avg stays 0
    # and the unconditional `1 / load_time.avg` raised ZeroDivisionError.
    if load_time.avg > 0:
        print('{:.4f} img/s'.format(1 / load_time.avg))
def train(args, model, dataset, name, is_warmup=False):
    """Benchmark a full training run of *model* on *dataset* and report timings.

    Trains for ``args.epochs`` epochs (or stops early once ``args.prof``
    progress prints have been emitted). Unless *is_warmup* is True, a detailed
    report table of timing/throughput statistics is printed at the end,
    including loader-internal timers when the loader exposes them.

    NOTE(review): reformatted from a collapsed source line; statement grouping
    follows the original token order.
    """
    import time
    import MixedPrecision.tools.utils as utils
    from MixedPrecision.tools.optimizer import OptimizerAdapter
    from MixedPrecision.tools.stats import StatStream
    from MixedPrecision.tools.monitor import make_monitor

    model = utils.enable_cuda(model)

    if args.half:
        # apex is only required (and imported) when training in fp16
        from apex.fp16_utils import network_to_half
        model = network_to_half(model)

    criterion = utils.enable_cuda(nn.CrossEntropyLoss())
    # No Half precision for the criterion
    # criterion = utils.enable_half(criterion)

    optimizer = torch.optim.SGD(
        model.parameters(), args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay)
    optimizer = OptimizerAdapter(
        optimizer,
        half=args.half,
        static_loss_scale=args.static_loss_scale,
        dynamic_loss_scale=args.dynamic_loss_scale)

    model.train()

    # Timing streams; the first 10 observations are dropped as warm-up.
    # NOTE(review): gpu_compute, data_loading_gpu/cpu and iowait are declared
    # but never fed in this function — presumably filled by other loaders or
    # left over from an earlier version; confirm before removing.
    epoch_compute = StatStream(drop_first_obs=10)
    batch_compute = StatStream(drop_first_obs=10)
    gpu_compute = StatStream(drop_first_obs=10)
    compute_speed = StatStream(drop_first_obs=10)
    effective_speed = StatStream(drop_first_obs=10)
    data_waiting = StatStream(drop_first_obs=10)
    data_loading_gpu = StatStream(drop_first_obs=10)
    data_loading_cpu = StatStream(drop_first_obs=10)
    full_time = StatStream(drop_first_obs=10)
    iowait = StatStream(drop_first_obs=10)
    transfert_time = StatStream(drop_first_obs=10)

    start_event = torch.cuda.Event(enable_timing=True, blocking=False, interprocess=False)
    end_event = torch.cuda.Event(enable_timing=True, blocking=False, interprocess=False)

    floss = float('inf')

    # Stop after n print when benchmarking (n * batch_count) batch
    print_count = 0
    monitor_proc, gpu_monitor = make_monitor(loop_interval=250)

    def should_run():
        # Run the full epoch count when not profiling, otherwise stop after
        # args.prof progress prints
        if args.prof is None:
            return True
        return print_count < args.prof

    try:
        for epoch in range(0, args.epochs):
            epoch_compute_start = time.time()

            # Looks like it only compute for the current process and not the children
            data_time_start = time.time()
            batch_count = 0
            effective_batch = 0

            for index, (x, y) in enumerate(dataset):
                # Host -> device transfer, timed separately from data waiting
                transfert_start = time.time()
                x = x.cuda()
                y = y.cuda().long()
                torch.cuda.synchronize()
                data_time_end = time.time()

                transfert_time += (data_time_end - transfert_start)
                data_waiting += (data_time_end - data_time_start)

                # compute output
                batch_compute_start = time.time()
                output = model(x)
                loss = criterion(output, y)
                floss = loss.item()

                # compute gradient and do SGD step
                optimizer.zero_grad()
                optimizer.backward(loss)
                optimizer.step()
                #print(floss)

                torch.cuda.synchronize()
                batch_compute_end = time.time()

                full_time += batch_compute_end - data_time_start
                batch_compute += batch_compute_end - batch_compute_start
                compute_speed += args.batch_size / (batch_compute_end - batch_compute_start)
                effective_speed += args.batch_size / (batch_compute_end - data_time_start)
                effective_batch += 1

                data_time_start = time.time()
                batch_count += 1

                if effective_batch % 10 == 0:
                    print_count += 1
                    speed_avg = args.batch_size / batch_compute.avg
                    print(
                        '[{:4d}][{:4d}] '
                        'Batch Time (avg: {batch_compute.avg:.4f}, sd: {batch_compute.sd:.4f}) '
                        'Speed (avg: {speed:.4f}) '
                        'Data (avg: {data_waiting.avg:.4f}, sd: {data_waiting.sd:.4f})'
                        .format(1 + epoch,
                                batch_count,
                                batch_compute=batch_compute,
                                speed=speed_avg,
                                data_waiting=data_waiting))

            epoch_compute_end = time.time()
            epoch_compute.update(epoch_compute_end - epoch_compute_start)

            if not should_run():
                break
            # NOTE(review): duplicated guard — this second check is unreachable
            # when the first breaks; one of the two likely belonged inside the
            # batch loop. Preserved as found.
            if not should_run():
                break
    finally:
        print('Done')
        gpu_monitor.stop()
        monitor_proc.terminate()

    if not is_warmup:
        hostname = socket.gethostname()
        current_device = torch.cuda.current_device()
        gpu = torch.cuda.get_device_name(current_device)
        gpu = gpu[0:min(10, len(gpu))].strip()  # shorten the name for the table
        bs = args.batch_size
        loader = args.loader

        header = [
            'Metric', 'Average', 'Deviation', 'Min', 'Max', 'count', 'half',
            'batch', 'workers', 'loader', 'model', 'hostname', 'GPU'
        ]
        # Columns appended to every row of the report
        common = [
            args.half, args.batch_size, args.workers, loader, name, hostname, gpu
        ]

        report_data = [
            ['Waiting for data (s)'] + data_waiting.to_array() + common,
            ['GPU Compute Time (s)'] + batch_compute.to_array() + common,
            ['Full Batch Time (s)'] + full_time.to_array() + common,
            [
                'Compute Speed (img/s)', bs / batch_compute.avg, 'NA',
                bs / batch_compute.max, bs / batch_compute.min,
                batch_compute.count
            ] + common,
            [
                'Effective Speed (img/s)', bs / full_time.avg, 'NA',
                bs / full_time.max, bs / full_time.min, batch_compute.count
            ] + common,
            # Ignored Metric
            # GPU timed on the CPU side (very close to GPU timing anway)
            #
            # ['CPU Compute Time (s)] + batch_compute.to_array() + common,
            # https://en.wikipedia.org/wiki/Harmonic_mean
            # ['Compute Inst Speed (img/s)'] + compute_speed.to_array() + common,
            # ['Effective Inst Speed (img/s)'] + effective_speed.to_array() + common,
            # ['iowait'] + iowait.to_array() + common
        ]

        # Only some loaders support this
        # So try and print an error but do not fail
        try:
            data_reading = dataset.dataset.read_timer
            data_transform = dataset.dataset.transform_timer
            collate_time = utils.timed_fast_collate.time_stream

            if data_loading_cpu.count > 1:
                report_data += [['Prefetch CPU Data loading (s)'] + data_loading_cpu.to_array() + common]
                report_data += [['Prefetch GPU Data Loading (s)'] + data_loading_gpu.to_array() + common]

            report_data += [['Read Time (s)'] + data_reading.to_array() + common]
            report_data += [['Transform Time (s)'] + data_transform.to_array() + common]
            report_data += [[
                'Read Speed per process (img/s)', 1.0 / data_reading.avg, 'NA',
                1.0 / data_reading.max, 1.0 / data_reading.min,
                data_reading.count
            ] + common]
            report_data += [[
                'Transform Speed per process (img/s)', 1.0 / data_transform.avg, 'NA',
                1.0 / data_transform.max, 1.0 / data_transform.min,
                data_transform.count
            ] + common]
            report_data += [[
                'Read Speed (img/s)', args.workers / data_reading.avg, 'NA',
                args.workers / data_reading.max, args.workers / data_reading.min,
                data_reading.count
            ] + common]
            report_data += [[
                'Transform Speed (img/s)', args.workers / data_transform.avg, 'NA',
                args.workers / data_transform.max, args.workers / data_transform.min,
                data_transform.count
            ] + common]
            report_data += [[
                'Image Aggregation Speed (img/s)', bs / collate_time.avg, 'NA',
                bs / collate_time.max, bs / collate_time.min, collate_time.count
            ] + common]
            report_data += [[
                'Image Aggregation Time (s)', collate_time.avg, collate_time.sd,
                collate_time.max, collate_time.min, collate_time.count
            ] + common]
        except Exception as e:
            # Best-effort: loader does not expose internal timers
            print(e)

        report_data.extend(gpu_monitor.arrays(common))
        report.print_table(header, report_data, filename=args.report)
    return
def __init__(self, collate, time_stream=None):
    """Wrap a collate function so each call's duration is recorded.

    Args:
        collate: the underlying collate callable being timed.
        time_stream: optional StatStream to accumulate into; a fresh
            ``StatStream(10)`` is created when omitted.
    """
    self.collate = collate
    # BUG FIX: the default used to be `time_stream=StatStream(10)` — a mutable
    # default argument evaluated once, so every wrapper created without an
    # explicit stream SHARED one StatStream and mixed their timings.
    self.time_stream = time_stream if time_stream is not None else StatStream(10)
# Benchmark script: measures HDF5 loader throughput for every combination of
# worker count and batch size, printing one CSV row per configuration.
# NOTE(review): relies on `hdf5_loader`, `ignore`, `StatStream` and `time`
# being defined/imported earlier in the file — confirm they are in scope here.
from MixedPrecision.tools.utils import show_args
import argparse

#main()

print('Batch Size, Workers, Average (s), SD (s), Min (s), Max (s), Count')

for w in (0, 1, 2, 4, 8):
    for b in (32, 64, 128, 256):
        # Create a new args that is usable by our data loader
        args = argparse.Namespace(
            data='/home/user1/test_database/imgnet/ImageNet.hdf5',
            workers=w,
            batch_size=b
        )

        loader = hdf5_loader(args)
        load = StatStream(20)  # drop first 20 observations as warm-up
        start = time.time()

        for index, (x, y) in enumerate(loader):
            end = time.time()
            ignore(x, y)
            load += end - start
            # Time roughly 100 batches per configuration
            if index > 100:
                break
            start = time.time()

        del loader

        print('{}, {}, {}, {}, {}, {}, {}'.format(b, w, load.avg, load.sd, load.min, load.max, load.count))