def get_arguments(self, parser, *args, **kwargs):
    """Parse the experiment arguments and build the benchmark timer.

    Side effects: stores the parsed namespace on ``self.args`` and a new
    ``MultiStageChrono`` (synced according to the args) on ``self._chrono``.

    Returns the parsed arguments.
    """
    parsed = parser.get_arguments()
    self.args = parsed
    self._chrono = MultiStageChrono(
        name=self.name,
        skip_obs=self.skip_obs,
        sync=get_sync(parsed),
    )
    return parsed
def run_check(spec, repeat=10, number=20, report_name=None):
    """Benchmark every (algo, arg, batch_size, tensor_size) combination in *spec*.

    Args:
        spec: dict describing the benchmark, with keys 'args', 'inputs',
            'algos', 'batch_size', 'get_output_layer', 'get_output_size'.
        repeat: number of timed observations per configuration.
        number: forward/backward iterations inside each observation.
        report_name: optional path; when given the report is also written there.
    """
    chrono = MultiStageChrono(skip_obs=2)

    args = spec['args']
    input_gen = spec['inputs']
    algos = spec['algos']
    batch_sizes = spec['batch_size']
    get_output_layer = spec['get_output_layer']
    get_output_size = spec['get_output_size']

    for algo, tensor_sizes in algos:
        for arg in args:
            # initialize the conv layer that we will benchmark
            layer = algo(**arg).cuda()

            for batch_size in batch_sizes:
                for tensor_size in tensor_sizes:
                    name = f'algo={algo.__name__},batch={batch_size},tensor={tensor_size},arg={arg}'
                    print(name)

                    try:
                        input = input_gen(layer, batch_size, tensor_size)
                        target = None
                        size = None
                        criterion = nn.MSELoss()

                        # Benchmark the layer
                        for i in range(0, repeat):
                            # ---
                            with chrono.time(name) as timer:
                                for _ in range(0, number):
                                    out = layer(*input)
                                    out = get_output_layer(*out)

                                    # lazily build the regression target once the
                                    # output size of this configuration is known
                                    if target is None:
                                        if get_output_size is None:
                                            size = reduce(mul, out.shape[1:])
                                        else:
                                            size = get_output_size(out.shape)
                                        target = torch.randn(batch_size, size).cuda()

                                    loss = criterion(target, out.view(-1, size))
                                    loss.backward()

                                # sync so the timer measures actual GPU work,
                                # not just kernel launch time
                                torch.cuda.synchronize()

                            print(f' Ran in {timer.avg:5.2f}s {timer.avg * repeat:5.2f}s')
                            # ---
                    except Exception as e:
                        # a configuration may be unsupported (e.g. OOM);
                        # log the failure and keep benchmarking the rest
                        print(f'[!] > {e}')
                        print(traceback.format_exc())

    report = chrono.to_json(indent=2)
    print(report)

    if report_name is not None:
        # fix: use a context manager so the report file is flushed and closed
        # (the original leaked the handle returned by open())
        # NOTE(review): `report` comes from to_json() and is printed above, so it
        # is presumably already a JSON string; json.dump re-encodes it — confirm
        # against MultiStageChrono.to_json before changing the on-disk format.
        with open(report_name, 'w') as report_file:
            json.dump(report, report_file, indent=2)
def test_cnn_base():
    """Benchmark CNNBase forward throughput over a range of batch sizes.

    Prints items/sec per batch size and the speed-up relative to the
    previous (smaller) batch size. Requires a CUDA device.
    """
    # torch.Size([2, 4, 84, 84])
    device = torch.device('cuda')
    print(torch.cuda.get_device_name(device))
    print(CNNBase((4, 84, 84)[0], hidden_size=512))

    prev = None
    for batch_size in [2, 4, 8, 16, 32, 64, 128]:
        chrono = MultiStageChrono()
        repeat = 30
        exp = f'forward_{batch_size}'

        input = torch.rand(batch_size, 4, 84, 84).cuda()
        net = CNNBase((4, 84, 84)[0], hidden_size=512)
        net.cuda()

        # first 10 observations — presumably warm-up (cudnn autotune, caches);
        # they still go through the same chrono stage
        for _ in range(0, 10):
            with chrono.time(exp):
                for _ in range(0, repeat):
                    net(input, None, None)
                # sync inside the timer so GPU work is actually measured
                torch.cuda.synchronize()

        # 30 measured observations
        for _ in range(0, 30):
            with chrono.time(exp):
                for _ in range(0, repeat):
                    net(input, None, None)
                torch.cuda.synchronize()

        # items processed per second for this batch size
        speed = batch_size * repeat / chrono.chronos[exp].avg
        speed_up = ''
        if prev:
            speed_up = f'Speed up x{speed / prev:7.4f}'

        print(f'{exp:>30} {speed:12,.4f} item/sec {speed_up}')
        prev = speed
def train(models, epochs, dataset, olr, lr_reset_threshold=1e-05, output_name='/tmp/', device_name='gpu'):
    """Train several models side by side on *dataset*, timing every stage.

    Args:
        models: mapping ``name -> model``; each model gets its own WindowedSGD.
        epochs: number of passes over the dataset.
        dataset: dataset wrapped below in a DataLoader (batch_size=64).
        olr: initial learning rate for WindowedSGD.
        lr_reset_threshold: passed to WindowedSGD as ``lr_min``.
        output_name: directory where per-epoch checkpoints are written.
        device_name: device string.  NOTE(review): ``torch.device('gpu')`` is
            not a valid torch device name — presumably 'cuda' was intended;
            confirm with callers.

    Returns:
        list (one entry per epoch) of per-model accumulated losses.
    """
    device = torch.device(device_name)
    train_loader = torch.utils.data.DataLoader(batch_size=64, shuffle=True, num_workers=4, dataset=dataset)

    # distribute the models round-robin over all visible GPUs (or the lone CPU)
    if torch.cuda.is_available():
        nd = torch.cuda.device_count()
        devices = [
            torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())
        ]
    else:
        nd = 1
        devices = [torch.device('cpu')]

    dataset_size = len(train_loader)

    # name -> (model, optimizer)
    models_optim = {}
    for name, model in models.items():
        model = model.to(device)
        optimizer = WindowedSGD(model.parameters(), epoch_steps=dataset_size, window=dataset_size, lr_min=lr_reset_threshold, lr=olr)
        model.train()
        models_optim[name] = (model, optimizer)

    epoch_time = MultiStageChrono(name='train', skip_obs=10)
    costs = []

    print('Start training')
    for e in range(0, epochs):
        all_cost = [0] * len(models_optim)

        with epoch_time.time('epoch') as step_time:
            for batch_idx, (data, target) in enumerate(train_loader):
                with epoch_time.time('models'):
                    for mid, (name, (model, optimizer)) in enumerate(models_optim.items()):
                        # pick this model's device round-robin
                        device = devices[mid % nd]
                        if torch.cuda.is_available():
                            torch.cuda.set_device(device)

                        # g1, torch.float32, True, False)
                        data = data.to(device, torch.float, True, True)
                        target = target.to(device, torch.long, True, True)
                        model = model.to(device)

                        # NOTE(review): the stage key here is the model object
                        # itself, not its name — presumably `name` was intended;
                        # verify against MultiStageChrono.time's expectations
                        with epoch_time.time(model):
                            optimizer.zero_grad()
                            output = model(data)
                            loss = F.nll_loss(output, target)
                            loss.backward()
                            all_cost[mid] += loss.item()
                            optimizer.step(loss)

                            # sync so the per-model timing reflects real GPU work
                            if torch.cuda.is_available():
                                torch.cuda.synchronize()
                        # ---
                    # ---
                # ---
            # ---

            # checkpoint every model at the end of the epoch
            with epoch_time.time('check_point'):
                for name, (model, _) in models_optim.items():
                    torch.save(model.state_dict(), f'{output_name}/{name}_{e}')

        infos = [
            f'{all_cost[idx]:8.2f}, {models_optim[name][1].lr:10.8f}'
            for idx, name in enumerate(models_optim)
        ]
        print(f'{e:3d}/{epochs:3d}, {step_time.val:6.2f}, ' + ', '.join(infos))
        costs.append(all_cost)

    print(epoch_time.to_json())
    return costs
def main():
    """Entry point: benchmark the train loop of a torchvision model.

    Parses CLI arguments, builds model/criterion/optimizer, then times the
    'loading' and 'compute' stages of each batch and prints an Img/sec summary.
    """
    # ---- command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', metavar='DIR', help='path to dataset')
    parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, metavar='LR')
    parser.add_argument('--opt-level', type=str)
    parser.add_argument('--cuda', action='store_true', default=True, dest='cuda')
    parser.add_argument('--no-cuda', action='store_false', dest='cuda')
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--loader', type=str, default='torch')
    parser.add_argument('--prof', type=int, default=None)
    parser.add_argument('--workers', type=int, default=4)
    parser.add_argument('--seed', type=int, default=4)
    parser.add_argument('--epochs', type=int, default=4)
    # NOTE(review): argparse `type=bool` treats any non-empty string as True
    # ("--sync-all false" still enables it) — confirm intended usage
    parser.add_argument('--sync-all', type=bool, default=False)

    args = parser.parse_args()
    chrono = MultiStageChrono(skip_obs=10, sync=None)

    device = torch.device('cpu')
    if torch.cuda.is_available() and args.cuda:
        device = torch.device('cuda')

    torch.set_num_threads(args.workers)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # -- enable cudnn autotuning when cudnn is present
    try:
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True
    except ImportError:
        pass

    # ---- model / loss / optimizer
    model = models.__dict__[args.arch]()
    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), args.lr)

    # # ---- (apex/amp mixed-precision setup kept for reference)
    # model, optimizer = amp.initialize(
    #     model,
    #     optimizer,
    #     enabled=args.opt_level != 'O0',
    #     cast_model_type=None,
    #     patch_torch_functions=True,
    #     keep_batchnorm_fp32=None,
    #     master_weights=None,
    #     loss_scale="dynamic",
    #     opt_level=args.opt_level
    # )

    # ----
    train_loader = loaders.load_dataset(args, train=True)

    # dataset is reduced but should be big enough for benchmark!
    batch_iter = iter(train_loader)

    def next_batch(iterator):
        # transparently restart the loader when it is exhausted
        try:
            return next(iterator), iterator
        except StopIteration:
            iterator = iter(train_loader)
            return next(iterator), iterator

    batch_count = len(train_loader)
    if args.prof is not None:
        batch_count = args.prof

    sync_fun = lambda: torch.cuda.current_stream().synchronize()
    # sub-stage syncs are optional: syncing per-stage is more accurate but slower
    sub_syncs = None
    if args.sync_all:
        sub_syncs = sync_fun

    print('Computing...')
    model.train()
    for epoch in range(args.epochs):
        # we sync after batch_count to not slowdown things
        with chrono.time('train', skip_obs=1, sync=sync_fun) as timer:
            for _ in range(batch_count):
                # data loading do not start here so naturally this is not data loading
                # only the time waiting for the data loading to finish
                with chrono.time('loading', sync=sub_syncs):
                    (input, target), batch_iter = next_batch(batch_iter)
                    input = input.to(device)
                    target = target.to(device)

                # if we do not synchronize we only get cuda `launch time`
                # not the actual compute
                with chrono.time('compute', sync=sub_syncs):
                    output = model(input)
                    loss = criterion(output, target)

                    # compute gradient and do SGD step
                    optimizer.zero_grad()

                    # with amp.scale_loss(loss, optimizer) as scaled_loss:
                    #     scaled_loss.backward()
                    loss.backward()
                    optimizer.step()

        print(
            f'[{epoch:3d}/{args.epochs:3d}] ETA: {(args.epochs - epoch - 1) * timer.avg:6.2f} sec'
        )

    print('--')
    print(chrono.to_json(indent=2))
    print('--')
    print(
        f'{(args.batch_size * batch_count) / chrono.chronos["train"].avg:6.2f} Img/sec'
    )
    print('-' * 25)
def make_report(chrono: MultiStageChrono, args: Namespace, version: str, batch_loss: RingBuffer, epoch_loss: RingBuffer, metrics, remote_logger):
    """Build a JSON benchmark report and append it to the per-GPU report file.

    Combines the chrono timings with run metadata (args, losses, metrics,
    hostname, a deterministic unique id) and writes the result both to the
    remote logger and to a local file.
    """
    if args is not None:
        args = args.__dict__
    else:
        args = {}

    # NOTE(review): when args is None the dict above is empty, so this lookup
    # raises KeyError — confirm all callers pass a populated Namespace
    if args['report'] is None:
        args['report'] = os.environ.get('REPORT_PATH')

    filename = args['report']
    # Each GPU has its report we will consolidate later
    # NOTE(review): this unconditionally overwrites the filename computed above
    filename = f'(unknown)_{args["jr_id"]}.json'

    args['version'] = version

    unique_id = hashlib.sha256()
    # make it deterministic
    items = list(args.items())
    items.sort()
    for k, w in items:
        if k in not_parameter:
            continue
        unique_id.update(str(k).encode('utf-8'))
        unique_id.update(str(w).encode('utf-8'))

    # we do not want people do modify our shit if the id do not match then they get disqualified
    args['unique_id'] = unique_id.hexdigest()

    # Try to identify vendors so we can find them more easily
    if args['cuda']:
        args['gpu'] = get_gpu_name()

    args['hostname'] = socket.gethostname()
    args['batch_loss'] = batch_loss.to_list()
    args['epoch_loss'] = epoch_loss.to_list()
    args['metrics'] = metrics

    for excluded in excluded_arguments:
        args.pop(excluded, None)

    remote_logger.log_parameters(args)
    report_dict = chrono.to_dict(args)

    # train is the default name for batched stuff
    if 'train' in report_dict:
        train_data = report_dict['train']

        # convert seconds-per-batch into items/sec (min time -> max throughput)
        item_count = report_dict['batch_size'] * report_dict['number']
        min_item = item_count / train_data['max']
        max_item = item_count / train_data['min']

        train_item = {
            'avg': item_count / train_data['avg'],
            'max': max_item,
            'min': min_item,
            'range': max_item - min_item,
            'unit': 'items/sec'
        }
        report_dict['train_item'] = train_item

    print('-' * 80)
    json_report = json.dumps(report_dict, sort_keys=True, indent=4, separators=(',', ': '))
    print(json_report)

    # the report file accumulates as '[' + obj + ',' + obj + ',' ...
    # NOTE(review): nothing here writes the closing ']' — presumably the
    # later consolidation step repairs the file; verify
    if not os.path.exists(filename):
        report_file = open(filename, 'w')
        report_file.write('[')
        report_file.close()

    report_file = open(filename, 'a')
    report_file.write(json_report)
    report_file.write(',')
    report_file.close()
    print('-' * 80)
import torch
import torch.nn as nn
import torch.optim as optim

from benchutils.chrono import show_eta, MultiStageChrono, time_this

from a2c_ppo_acktr.distributions import Categorical, DiagGaussian, Bernoulli
from a2c_ppo_acktr.utils import init


class Flatten(nn.Module):
    """Collapse every dimension except the batch dimension."""

    def forward(self, x):
        return x.view(x.size(0), -1)


# module-level timer shared by this file's benchmarks
chrono = MultiStageChrono()


class CNNBase(nn.Module):
    """Convolutional trunk (3 conv layers + Flatten + Linear, ReLU throughout).

    The 32 * 7 * 7 flattened size implies the input is expected to be
    (num_inputs, 84, 84) — TODO confirm with callers.
    """

    def __init__(self, num_inputs, hidden_size=512):
        super(CNNBase, self).__init__()
        self._hidden_size = hidden_size

        # orthogonal weight init with ReLU gain, biases zeroed
        init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), nn.init.calculate_gain('relu'))

        self.main = nn.Sequential(
            init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), nn.ReLU(),
            init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(),
            init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(),
            init_(nn.Linear(32 * 7 * 7, hidden_size)), nn.ReLU())
import torch
import torch.nn as nn
import torch.optim as optim

from benchutils.chrono import show_eta, MultiStageChrono, time_this

from a2c_ppo_acktr.utils import init


class Flatten(nn.Module):
    """Collapse every dimension except the batch dimension."""

    def forward(self, x):
        return x.view(x.size(0), -1)


# module-level timer shared by this file's benchmarks
chrono = MultiStageChrono()


class CNNBase(nn.Module):
    """Convolutional trunk (3 conv layers + Flatten + Linear, ReLU throughout).

    The 32 * 7 * 7 flattened size implies the input is expected to be
    (num_inputs, 84, 84) — TODO confirm with callers.
    """

    def __init__(self, num_inputs, hidden_size=512):
        super(CNNBase, self).__init__()
        self._hidden_size = hidden_size

        # orthogonal weight init with ReLU gain, biases zeroed
        init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), nn.init.calculate_gain('relu'))

        self.main = nn.Sequential(
            init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), nn.ReLU(),
            init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(),
            init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(),
            init_(nn.Linear(32 * 7 * 7, hidden_size)), nn.ReLU())
class Experiment:
    """
    Store all the information we care about during an experiment

        chrono : Performance timer
        args : argument passed to the script
        name : name of the experiment
        batch_loss_buffer : Batch loss values (just making sure we do not get NaNs)
        epoch_loss_buffer : Epoch loss values, same as batch loss
    """

    def __init__(self, module, skip_obs=10):
        self.name, self.version = get_experience_descriptor(module)
        self._chrono = None
        self.skip_obs = skip_obs
        self.args = None
        self.batch_loss_buffer = RingBuffer(100, torch.float32)
        self.epoch_loss_buffer = RingBuffer(10, torch.float32)
        self.metrics = {}
        self.remote_logger = None

        # best effort: fall back to a default logger when the remote one
        # cannot be configured from the environment
        try:
            self.remote_logger = make_log(
                api_key=os.environ.get("CML_API_KEY"),
                project_name=self.name,
                workspace=os.environ.get("CML_WORKSPACE"))
        except Exception as e:
            print(e)
            self.remote_logger = make_log()

    def get_arguments(self, parser, *args, **kwargs):
        # parse CLI arguments, then build the chrono (its sync mode depends
        # on the parsed args)
        self.args = parser.get_arguments()
        self._chrono = MultiStageChrono(name=self.name, skip_obs=self.skip_obs, sync=get_sync(self.args))
        return self.args

    def chrono(self):
        # accessor for the timer built in get_arguments()
        return self._chrono

    def log_batch_loss(self, val):
        self.batch_loss_buffer.append(val)
        self.remote_logger.log_metric('batch_loss', val)

    def log_epoch_loss(self, val):
        self.epoch_loss_buffer.append(val)
        self.remote_logger.log_metric('epoch_loss', val)

    def log_metric(self, name, val):
        # lazily create one ring buffer per metric name
        metric = self.metrics.get(name)
        if metric is None:
            metric = RingBuffer(10, torch.float32)
            self.metrics[name] = metric
        metric.append(val)
        self.remote_logger.log_metric(name, val)

    def report(self):
        """Build the JSON report for this run and append it to the per-GPU report file."""
        if self.args is not None:
            args = self.args.__dict__
        else:
            args = {}

        # NOTE(review): when self.args is None, args == {} and the lookup
        # below raises KeyError — confirm report() only runs after get_arguments()
        if args['report'] is None:
            args['report'] = os.environ.get('REPORT_PATH')

        filename = args['report']
        # Each GPU has its report we will consolidate later
        # NOTE(review): this unconditionally overwrites the filename computed above
        filename = f'(unknown)_{args["jr_id"]}.json'

        args['version'] = self.version

        unique_id = hashlib.sha256()
        # make it deterministic
        items = list(args.items())
        items.sort()
        for k, w in items:
            if k in not_parameter:
                continue
            unique_id.update(str(k).encode('utf-8'))
            unique_id.update(str(w).encode('utf-8'))

        # we do not want people do modify our shit if the id do not match then they get disqualified
        args['unique_id'] = unique_id.hexdigest()

        # Try to identify vendors so we can find them more easily
        if args['cuda']:
            args['gpu'] = get_gpu_name()

        args['hostname'] = socket.gethostname()
        args['batch_loss'] = self.batch_loss_buffer.to_list()
        args['epoch_loss'] = self.epoch_loss_buffer.to_list()
        args['metrics'] = self.metrics

        for excluded in excluded_arguments:
            args.pop(excluded, None)

        self.remote_logger.log_parameters(args)
        report_dict = self._chrono.to_dict(args)

        # train is the default name for batched stuff
        if 'train' in report_dict:
            train_data = report_dict['train']

            # convert seconds-per-batch into items/sec (min time -> max throughput)
            item_count = report_dict['batch_size'] * report_dict['number']
            min_item = item_count / train_data['max']
            max_item = item_count / train_data['min']

            train_item = {
                'avg': item_count / train_data['avg'],
                'max': max_item,
                'min': min_item,
                'range': max_item - min_item,
                'unit': 'items/sec'
            }
            report_dict['train_item'] = train_item

        print('-' * 80)
        json_report = json.dumps(report_dict, sort_keys=True, indent=4, separators=(',', ': '))
        print(json_report)

        # the report file accumulates as '[' + obj + ',' + obj + ',' ...
        # NOTE(review): nothing writes the closing ']' — presumably the later
        # consolidation step repairs the file; verify
        if not os.path.exists(filename):
            report_file = open(filename, 'w')
            report_file.write('[')
            report_file.close()

        report_file = open(filename, 'a')
        report_file.write(json_report)
        report_file.write(',')
        report_file.close()
        print('-' * 80)

    def get_device(self):
        # device/seed setup derived from the parsed args
        return init_torch(self.args)

    def get_time(self, stream):
        # fall back to the last observation when no average is available yet
        if stream.avg == 0:
            return stream.val
        return stream.avg

    def show_eta(self, epoch_id, timer, msg=''):
        # print progress line: [epoch/total] | ETA in minutes | last batch loss | msg
        if msg:
            msg = ' | ' + msg

        loss = self.batch_loss_buffer.last()
        if loss is not None:
            loss = f'| Batch Loss {loss:8.4f}'
        else:
            loss = ''

        print(
            f'[{epoch_id:3d}/{self.args.repeat:3d}] '
            f'| ETA: {self.get_time(timer) * (self.args.repeat - (epoch_id + 1)) / 60:6.2f} min ' + loss + msg)