def __init__(self, with_val=True):
    meters = {
        'config': ValueMeter(skip_reset=True),
        'host_info': ValueMeter(skip_reset=True),
        'epoch': ValueMeter(),
        'data_load_time': MeanValueMeter(),
        'data_transfer_time': MeanValueMeter(),
        'forward_time': MeanValueMeter(),
        'backward_time': MeanValueMeter(),
        'optim_time': MeanValueMeter(),
        'eval_time': MeanValueMeter(),
        'train_loss': MeanValueMeter(),
        'train_mpjpe': MeanValueMeter(),
        'train_pck': MeanValueMeter(),
        'train_examples': ValueMeter(),
    }
    if with_val:
        meters.update({
            'val_loss': MeanValueMeter(),
            'val_mpjpe': MeanValueMeter(),
            'val_pck': MeanValueMeter(),
            'val_examples': ValueMeter(),
        })
    self.with_val = with_val
    self.telemetry = tele.Telemetry(meters)
def __init__(self, train_eval, val_eval):
    self.telemetry = tele.Telemetry({
        'experiment_id': tele.meter.ValueMeter(skip_reset=True),
        'epoch': tele.meter.ValueMeter(),
        'train_loss': torchnet.meter.AverageValueMeter(),
        'val_loss': torchnet.meter.AverageValueMeter(),
        'epoch_time': torchnet.meter.TimeMeter(unit=False),
        'train_data_load_time': torchnet.meter.AverageValueMeter(),
        'train_data_transfer_time': torchnet.meter.AverageValueMeter(),
        'train_forward_time': torchnet.meter.AverageValueMeter(),
        'train_criterion_time': torchnet.meter.AverageValueMeter(),
        'train_backward_time': torchnet.meter.AverageValueMeter(),
        'train_optim_time': torchnet.meter.AverageValueMeter(),
        'train_eval_time': torchnet.meter.AverageValueMeter(),
        'train_sample': tele.meter.ValueMeter(),
        'val_sample': tele.meter.ValueMeter(),
        'train_heatmaps': tele.meter.ValueMeter(),
        'val_heatmaps': tele.meter.ValueMeter(),
        'args': tele.meter.ValueMeter(skip_reset=True),
        'train_pckh_total': train_eval.meters['total_mpii'],
        'val_pckh_total': val_eval.meters['total_mpii'],
        'train_pckh_all': train_eval.meters['all'],
        'val_pckh_all': val_eval.meters['all'],
        'val_preds': tele.meter.ValueMeter(),
        'best_val_preds': tele.meter.ValueMeter(skip_reset=True),
        'model_graph': tele.meter.ValueMeter(skip_reset=True),
    })
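# A minimal sketch of how meter dictionaries like the two above are driven
# during training. tele.Telemetry, ValueMeter.set_value(), indexing by meter
# name, and step() all appear in the scripts below; the add() method on
# MeanValueMeter is an assumption, mirroring torchnet's AverageValueMeter.

import tele
from tele.meter import ValueMeter, MeanValueMeter

tel = tele.Telemetry({
    'epoch': ValueMeter(),
    'train_loss': MeanValueMeter(),
})

for epoch in range(2):
    tel['epoch'].set_value(epoch)
    for batch_loss in [0.9, 0.7, 0.5]:       # stand-in for real per-batch losses
        tel['train_loss'].add(batch_loss)    # assumed torchnet-style add()
    tel.step()                               # flush to sinks, reset non-skip_reset meters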
def main():
    args = parse_args()
    seed_random_number_generators(args.seed)

    model_desc = {
        'base': args.base_model,
        'dilate': args.dilate,
        'truncate': args.truncate,
        'output_strat': args.output_strat,
        'preact': args.preact,
        'reg': args.reg,
        'reg_coeff': args.reg_coeff,
        'hm_sigma': args.hm_sigma,
    }
    model = build_mpii_pose_model(**model_desc)
    model.cuda()

    train_data = MPIIDataset('/datasets/mpii', 'train', use_aug=True,
                             image_specs=model.image_specs)
    sampler = make_data_sampler(args.max_iters * args.batch_size, len(train_data))
    train_loader = DataLoader(train_data, args.batch_size, num_workers=4,
                              drop_last=True, sampler=sampler)
    data_iter = iter(train_loader)

    print(json.dumps(model_desc, sort_keys=True, indent=2))

    def do_training_iteration(optimiser):
        batch = next(data_iter)

        in_var = Variable(batch['input'].cuda(), requires_grad=False)
        target_var = Variable(batch['part_coords'].cuda(), requires_grad=False)
        mask_var = Variable(batch['part_mask'].type(torch.cuda.FloatTensor),
                            requires_grad=False)

        # Calculate predictions and loss
        out_var = model(in_var)
        loss = model.forward_loss(out_var, target_var, mask_var)

        # Calculate gradients
        optimiser.zero_grad()
        loss.backward()

        # Update parameters
        optimiser.step()

        # Scalar loss value (pre-0.4 PyTorch Variable API)
        return loss.data[0]

    optimiser = SGD(model.parameters(), lr=1, weight_decay=args.weight_decay,
                    momentum=args.momentum)

    tel = tele.Telemetry({
        'cli_args': ValueMeter(skip_reset=True),
        'loss_lr': ValueMeter(),
    })
    tel['cli_args'].set_value(vars(args))

    if args.showoff:
        client = pyshowoff.Client('http://' + args.showoff)
        notebook = client.add_notebook(
            'Hyperparameter search ({}-d{}-t{}, {}, reg={})'.format(
                args.base_model, args.dilate, args.truncate, args.output_strat,
                args.reg)).result()
        tel.sink(tele.showoff.Conf(notebook), [
            Inspect(['cli_args'], 'CLI arguments', flatten=True),
            XYGraph(['loss_lr'], 'Loss vs learning rate graph'),
        ])

    # Sweep the learning rate geometrically from lr_min to lr_max, recording
    # a bias-corrected EMA of the loss at each step.
    lrs = np.geomspace(args.lr_min, args.lr_max, args.max_iters)
    avg_loss = 0
    min_loss = np.inf
    for i, lr in enumerate(tqdm(lrs, ascii=True)):
        for param_group in optimiser.param_groups:
            param_group['lr'] = lr
        loss = do_training_iteration(optimiser)
        avg_loss = args.ema_beta * avg_loss + (1 - args.ema_beta) * loss
        smoothed_loss = avg_loss / (1 - args.ema_beta ** (i + 1))
        # Stop early if the loss diverges
        if min_loss > 0 and smoothed_loss > 4 * min_loss:
            break
        min_loss = min(smoothed_loss, min_loss)
        tel['loss_lr'].set_value((lr, smoothed_loss))
        tel.step()
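# Why the loop above divides by (1 - ema_beta ** (i + 1)): an exponential
# moving average initialised at zero is biased towards zero early on, and this
# is the standard bias correction (as used in Adam). A self-contained check:

import numpy as np

ema_beta = 0.98
avg_loss = 0.0
for i, loss in enumerate(np.full(10, 2.0)):  # constant loss of 2.0
    avg_loss = ema_beta * avg_loss + (1 - ema_beta) * loss
    smoothed_loss = avg_loss / (1 - ema_beta ** (i + 1))
    # avg_loss starts far below 2.0 (0.04 after one step), whereas the
    # corrected smoothed_loss recovers 2.0 exactly at every step.
    assert abs(smoothed_loss - 2.0) < 1e-9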
def sacred_main(_run: Run, seed, showoff, batch_size, model_desc, deterministic,
                train_datasets, lr_min, lr_max, max_iters, ema_beta,
                weight_decay, momentum):
    seed_all(seed)
    init_algorithms(deterministic=deterministic)

    model = create_model(model_desc).to(global_opts['device'])
    data_loader = create_train_dataloader(
        train_datasets, model.data_specs, batch_size,
        examples_per_epoch=(max_iters * batch_size))
    data_iter = iter(data_loader)

    print(json.dumps(model_desc, sort_keys=True, indent=2))

    def do_training_iteration(optimiser):
        batch = next(data_iter)

        in_var = batch['input'].to(global_opts['device'], torch.float32)
        target_var = batch['target'].to(global_opts['device'], torch.float32)
        mask_var = batch['joint_mask'].to(global_opts['device'], torch.float32)

        # Calculate predictions and loss
        out_var = model(in_var)
        loss = forward_loss(model, out_var, target_var, mask_var,
                            batch['valid_depth'])

        # Calculate gradients
        optimiser.zero_grad()
        loss.backward()

        # Update parameters
        optimiser.step()

        return loss.item()

    optimiser = SGD(model.parameters(), lr=1, weight_decay=weight_decay,
                    momentum=momentum)

    tel = tele.Telemetry({
        'config': ValueMeter(skip_reset=True),
        'host_info': ValueMeter(skip_reset=True),
        'loss_lr_fig': ValueMeter(),
    })

    notebook = None
    if showoff:
        title = 'Hyperparameter search ({}@{})'.format(model_desc['type'],
                                                       model_desc['version'])
        notebook = create_showoff_notebook(title, ['lrfinder'])

        from tele.showoff import views

        tel.sink(tele.showoff.Conf(notebook), [
            views.Inspect(['config'], 'Experiment configuration', flatten=True),
            views.Inspect(['host_info'], 'Host information', flatten=True),
            views.FrameContent(['loss_lr_fig'], 'Loss vs learning rate graph',
                               'plotly'),
        ])

    def set_progress(value):
        if notebook is not None:
            notebook.set_progress(value)

    tel['config'].set_value(_run.config)
    tel['host_info'].set_value(get_host_info())

    # Sweep the learning rate geometrically from lr_min to lr_max, recording
    # a bias-corrected EMA of the loss at each step.
    lrs = np.geomspace(lr_min, lr_max, max_iters)
    losses = []
    avg_loss = 0
    min_loss = np.inf
    for i, lr in enumerate(tqdm(lrs, ascii=True)):
        set_progress(i / len(lrs))

        for param_group in optimiser.param_groups:
            param_group['lr'] = lr
        loss = do_training_iteration(optimiser)
        avg_loss = ema_beta * avg_loss + (1 - ema_beta) * loss
        smoothed_loss = avg_loss / (1 - ema_beta ** (i + 1))

        # Stop early if the loss diverges
        if min_loss > 0 and smoothed_loss > 4 * min_loss:
            break
        min_loss = min(smoothed_loss, min_loss)
        losses.append(smoothed_loss)

        # Redraw the loss vs learning rate figure every 10 iterations
        if i % 10 == 0:
            fig = go.Figure(
                data=[go.Scatter(x=lrs[:len(losses)].tolist(), y=losses,
                                 mode='lines')],
                layout=go.Layout(
                    margin=go.Margin(l=60, r=40, b=80, t=20, pad=4),
                    xaxis=go.XAxis(title='Learning rate', type='log',
                                   exponentformat='power'),
                    yaxis=go.YAxis(title='Training loss'),
                )
            )
            tel['loss_lr_fig'].set_value(fig)
            tel.step()

    set_progress(1)
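# Neither script selects a learning rate automatically; the loss-vs-LR curve is
# meant to be read by eye. A common heuristic (a hypothetical addition, not
# part of the code above) is to take the learning rate where the smoothed loss
# descends fastest, then back off from it. A sketch with synthetic data:

import numpy as np

def suggest_lr(lrs, smoothed_losses):
    """Return the LR at the steepest descent of the smoothed loss curve."""
    lrs = np.asarray(lrs)[:len(smoothed_losses)]  # the sweep may break early
    grads = np.gradient(np.asarray(smoothed_losses), np.log10(lrs))
    return lrs[np.argmin(grads)]

lrs = np.geomspace(1e-6, 1e0, 100)
log_lr = np.log10(lrs)
fake_losses = 2.0 - 1.5 / (1 + np.exp(-(log_lr + 3)))  # drops fastest near 1e-3
print(suggest_lr(lrs, fake_losses))  # ~1e-3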