def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs."""
    # Only do this test if there are GPUs available.
    if not torch.cuda.is_available():
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    iter = 0
    dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor,
              torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        iter += 1
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        device = local_rank * 2 + (iter + local_rank) % 2
        tensor = tensor.cuda(device).type(dtype)
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.cuda.IntTensor,
                                  torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, \
            'hvd.allreduce produces incorrect results'
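# For context: a minimal standalone sketch of the collective the test above
# exercises. This is illustrative, not part of the test suite; it assumes a
# Horovod build that still accepts the legacy `average=` keyword (newer
# releases spell the same thing with `op=hvd.Sum` / `op=hvd.Average`).
import torch
import horovod.torch as hvd

hvd.init()
x = torch.ones(4) * hvd.rank()
summed = hvd.allreduce(x, average=False)   # elementwise sum over all ranks
averaged = hvd.allreduce(x)                # default: average over all ranks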
parser.add_argument('--distill_loss_alpha', type=float, default=2.)
# multi-res training
parser.add_argument('--n_res', type=int, default=1,
                    help='number of resolutions to support for training')
parser.add_argument('--n_sampled_res', type=int, default=1,
                    help='number of resolutions to sample per iter')
# adaptive channel training
parser.add_argument('--dynamic_channel', action='store_true', default=False)
parser.add_argument('--dynamic_channel_mode', type=str, default='uniform')
parser.add_argument('--sort_pretrain', action='store_true', default=False)
parser.add_argument('--conditioned_d', action='store_true', default=False,
                    help='D is conditioned on G')
parser.add_argument('--min_channel', type=int, default=8)
parser.add_argument('--divided_by', type=int, default=4)

args = parser.parse_args()

hvd.init()
torch.cuda.set_device(hvd.local_rank())
# Save memory when using dynamic channels: cudnn benchmark triggers a bug
# when computing the G regularization, resulting in extremely slow computation.
cudnn.benchmark = (not args.dynamic_channel) and args.g_reg_every <= 0

assert args.job is not None
if hvd.rank() == 0:
    print(' * JOB:', args.job)

# make log dirs
os.makedirs(log_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(os.path.join(checkpoint_dir, args.job), exist_ok=True)
log_writer = SummaryWriter(os.path.join(log_dir, args.job)) \
    if hvd.rank() == 0 else None

if hvd.rank() == 0:
    # save args
    with open(os.path.join(log_dir, args.job, 'args.txt'), 'w') as f:
        # body truncated in the source; writing out the parsed args is the
        # evident intent (assumed form):
        json.dump(vars(args), f, indent=2)
def sync_horovod(self):
    if self.use_horovod:
        hvd.join(hvd.local_rank() if self.on_gpu else -1)
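# hvd.join() blocks until every rank has called it, letting ranks with
# uneven amounts of work finish cleanly; the argument selects the GPU used
# for the underlying collectives (-1 for CPU). A minimal sketch of the
# uneven-input situation it guards against (assumes a Horovod version with
# join support; the allreduce name is illustrative):
import torch
import horovod.torch as hvd

hvd.init()
device = hvd.local_rank() if torch.cuda.is_available() else -1
n_batches = 10 + hvd.rank()  # ranks intentionally see different workloads
for _ in range(n_batches):
    hvd.allreduce(torch.ones(1), name='step')
hvd.join(device)  # early finishers keep serving collectives until all join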
# set random seed
random.seed(config['params']['random_seed'])
np.random.seed(config['params']['random_seed'])
torch.manual_seed(config['params']['random_seed'])

# variables
best_valid_loss = float('inf')
global_iter_train = 0
global_iter_valid = 0

# cuda
if torch.cuda.is_available() and not config['cuda']['using_cuda']:
    print("WARNING: You have a CUDA device, "
          "so you should probably run with CUDA enabled")
cuda_str = 'cuda:' + str(hvd.local_rank())
device = torch.device(cuda_str if config['cuda']['using_cuda'] else "cpu")
if config['cuda']['using_cuda']:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(config['params']['random_seed'])
    # Horovod: limit # of CPU threads to be used per worker.
    # torch.set_num_threads(num_workers)

# tensorboard
if hvd.rank() == 0:
    summary_writer = SummaryWriter(
        os.path.join(config['model']['exp_path'], 'log'))

# Data
target_classes = utils.read_txt(config['params']['classes'])
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    ans2label = json.load(open(f"{dirname(abspath(__file__))}"
                               f"/utils/ans2label.json"))
    label2ans = {label: ans for ans, label in ans2label.items()}

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)
    # train
    LOGGER.info(f"Loading Train Dataset "
                f"{opts.train_txt_dbs}, {opts.train_img_dbs}")
    train_datasets = []
    for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs):
        img_db = all_img_dbs[img_path]
        txt_db = TxtTokLmdb(txt_path, opts.max_txt_len)
        train_datasets.append(VqaDataset(len(ans2label), txt_db, img_db))
    train_dataset = ConcatDatasetWithLens(train_datasets)
    train_dataloader = build_dataloader(train_dataset, vqa_collate, True,
                                        opts)
    # val
    LOGGER.info(f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}")
    val_img_db = all_img_dbs[opts.val_img_db]
    val_txt_db = TxtTokLmdb(opts.val_txt_db, -1)
    val_dataset = VqaEvalDataset(len(ans2label), val_txt_db, val_img_db)
    val_dataloader = build_dataloader(val_dataset, vqa_eval_collate,
                                      False, opts)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    all_dbs = opts.train_txt_dbs + [opts.val_txt_db]
    toker = json.load(open(f"{all_dbs[0]}/meta.json"))["bert"]
    assert all(toker == json.load(open(f"{db}/meta.json"))["bert"]
               for db in all_dbs)
    model = UniterForVisualQuestionAnswering.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, num_answer=len(ans2label))
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level="O2")

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, "log"))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, "ckpt"))
        json.dump(ans2label,
                  open(join(opts.output_dir, "ckpt", "ans2label.json"), "w"))
        os.makedirs(join(opts.output_dir, "results"))  # store VQA predictions
        add_log_to_file(join(opts.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataset) * hvd.size())
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter("loss")
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    while True:
        for step, batch in enumerate(train_dataloader):
            n_examples += batch["input_ids"].size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean() * batch["targets"].size(1)  # instance-level bce
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every process;
                    # do this before unscaling to make sure every process
                    # uses the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for i, param_group in enumerate(optimizer.param_groups):
                    if i == 0 or i == 1:
                        param_group["lr"] = lr_this_step * opts.lr_mul
                    elif i == 2 or i == 3:
                        param_group["lr"] = lr_this_step
                    else:
                        raise ValueError()
                TB_LOGGER.add_scalar("lr", lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar("loss", running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar("grad_norm", grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info(f"============Step {global_step}=============")
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f"{tot_ex} examples trained at "
                                f"{ex_per_sec} ex/s")
                    TB_LOGGER.add_scalar("perf/ex_per_s", ex_per_sec,
                                         global_step)
                    LOGGER.info(f"===========================================")

                if global_step % opts.valid_steps == 0:
                    val_log, results = validate(model, val_dataloader,
                                                label2ans)
                    with open(f"{opts.output_dir}/results/"
                              f"results_{global_step}_"
                              f"rank{rank}.json", "w") as f:
                        json.dump(results, f)
                    TB_LOGGER.log_scaler_dict(val_log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"finished {n_epoch} epochs")

    if opts.num_train_steps % opts.valid_steps != 0:
        val_log, results = validate(model, val_dataloader, label2ans)
        with open(f"{opts.output_dir}/results/"
                  f"results_{global_step}_"
                  f"rank{rank}.json", "w") as f:
            json.dump(results, f)
        TB_LOGGER.log_scaler_dict(val_log)
        model_saver.save(model, global_step)
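# all_reduce_and_rescale_tensors is a project helper, not a Horovod API. A
# simplified sketch of what it can do (the real helper may fuse tensors into
# flat buffers before reducing; the suffix on the name marks it as a sketch):
import horovod.torch as hvd

def all_reduce_and_rescale_tensors_sketch(tensors, rescale_denom):
    """Sum each tensor across ranks in place, then divide by rescale_denom."""
    for t in tensors:
        hvd.allreduce_(t, average=False)  # in-place sum over all ranks
        t.div_(rescale_denom)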
def train_main(args, filenames):
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if torch.cuda.is_available() and not args.no_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    rank = hvd.rank()
    train_dataset = create_dataset(
        filenames,
        batch_size=args.batch_size,
        rank=rank,
        num_epochs=args.epochs,
        world_size=hvd.size(),
        num_reducers=args.num_reducers,
        max_concurrent_epochs=args.max_concurrent_epochs)
    model = Net()
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if torch.cuda.is_available() and not args.no_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    def _train(epoch):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_dataset.set_epoch(epoch)
        start_epoch = timeit.default_timer()
        last_batch_time = start_epoch
        batch_wait_times = []
        for batch_idx, (data, target) in enumerate(train_dataset):
            batch_wait_times.append(timeit.default_timer() - last_batch_time)
            if torch.cuda.is_available() and not args.no_cuda:
                if isinstance(data, list):
                    data = [t.cuda() for t in data]
                target = target.cuda()
            optimizer.zero_grad()
            # output = model(data)
            if batch_idx % args.log_interval == 0:
                print(f"Processing batch {batch_idx} in epoch {epoch} "
                      f"on worker {rank}.")
            time.sleep(args.mock_train_step_time)
            # TODO(Clark): Add worker synchronization barrier here.
            # loss = F.nll_loss(output, target)
            # loss.backward()
            # optimizer.step()
            last_batch_time = timeit.default_timer()
        epoch_duration = timeit.default_timer() - start_epoch
        avg_batch_wait_time = np.mean(batch_wait_times)
        std_batch_wait_time = np.std(batch_wait_times)
        max_batch_wait_time = np.max(batch_wait_times)
        min_batch_wait_time = np.min(batch_wait_times)
        print(f"\nEpoch {epoch}, worker {rank} stats over "
              f"{len(batch_wait_times)} steps: {epoch_duration:.3f}")
        print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
              f"{std_batch_wait_time:.3f}")
        print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
        print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
        return batch_wait_times

    print(f"Starting training on worker {rank}.")
    batch_wait_times = []
    for epoch in range(args.epochs):
        # TODO(Clark): Don't include stats from first epoch since we already
        # expect that epoch to be cold?
        batch_wait_times.extend(_train(epoch))
    print(f"Done training on worker {rank}.")
    avg_batch_wait_time = np.mean(batch_wait_times)
    std_batch_wait_time = np.std(batch_wait_times)
    max_batch_wait_time = np.max(batch_wait_times)
    min_batch_wait_time = np.min(batch_wait_times)
    print(f"\nWorker {rank} training stats over {args.epochs} epochs:")
    print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
          f"{std_batch_wait_time:.3f}")
    print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
    print(f"Min batch wait time: {min_batch_wait_time:.3f}s")

    # TODO(Clark): Add logic to the dataset abstraction so we don't have to
    # do this.
    if rank == 0:
        print("Waiting in rank 0 worker to let other workers consume queue...")
        time.sleep(10)
        print("Done waiting in rank 0 worker.")
                    help='random seed (default: 42)')
parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                    help='use fp16 compression during allreduce')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Horovod: initialize library.
hvd.init()
torch.manual_seed(args.seed)
verbose = hvd.rank() == 0

if args.cuda:
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(args.seed)

torch.backends.cudnn.benchmark = True

args.log_dir = os.path.join(
    args.log_dir,
    "cifar10_{}_kfac{}_gpu_{}_{}".format(
        args.model, args.kfac_update_freq, hvd.size(),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
os.makedirs(args.log_dir, exist_ok=True)
log_writer = SummaryWriter(args.log_dir) if verbose else None

# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(4)
kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
def train(serialized_model, optimizer_cls, model_opt_state_serialized,
          train_rows, val_rows, avg_row_size):
    from petastorm import TransformSpec, make_reader, make_batch_reader
    from petastorm.pytorch import BatchedDataLoader
    import torch
    import horovod.torch as hvd

    # Deserializing objects
    model_opt_state = torch.load(model_opt_state_serialized)
    model = deserialize(serialized_model)

    if loss_fns_pre_train:
        loss_fns = loss_fns_pre_train
    if loss_constructors:
        local_vars = locals()
        loss_fns = [loss_constructor(**local_vars)
                    for loss_constructor in loss_constructors]

    # Horovod: initialize library.
    hvd.init()

    if not user_shuffle_buffer_size:
        shuffle_buffer_size = \
            calculate_shuffle_buffer_size(hvd, avg_row_size,
                                          train_rows / hvd.size())
    else:
        shuffle_buffer_size = user_shuffle_buffer_size

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Horovod: pin GPU to local rank or the assigned GPU from spark.
        torch.cuda.set_device(
            _get_assigned_gpu_or_default(default=hvd.local_rank()))
        # Move model to GPU.
        model.cuda()

    # The optimizer object needs to be re-instantiated. Internally, it uses
    # memory addresses of objects as their identity, so it cannot simply be
    # serialized and then deserialized: the deserialized optimizer stores the
    # parameter names keyed by their old memory addresses, which differ from
    # those of the reconstructed objects, and that creates problems.
    # Learning rate is a required parameter for the SGD optimizer. It will be
    # overridden by load_state_dict.
    optimizer = optimizer_cls(model.parameters(), lr=1)
    optimizer_state = model_opt_state['optimizer']

    if last_checkpoint_state is not None:
        model.load_state_dict(last_checkpoint_state['model'])
        optimizer.load_state_dict(last_checkpoint_state['optimizer'])
    else:
        # scale the learning rate with the number of horovod workers
        for i in range(len(optimizer_state['param_groups'])):
            optimizer_state['param_groups'][i]['lr'] = \
                optimizer_state['param_groups'][i]['lr'] * hvd.size()
        optimizer.load_state_dict(optimizer_state)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for group in optimizer.param_groups:
        for p in group['params']:
            if id(p) not in optimizer.state_dict()['state']:
                p.grad = p.data.new(p.size()).zero_()
    optimizer.step()
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    dist_optimizer_args = dict(optimizer=optimizer,
                               named_parameters=model.named_parameters())
    if gradient_compression:
        # Pass the compression arg only if it is specified by the user.
        dist_optimizer_args['compression'] = gradient_compression
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

    # get_optimizer_with_unscaled_lr takes the current optimizer and
    # constructs a new one with the same state, except with the learning
    # rate scaled back down by the number of horovod workers. This matters
    # when retraining the model: the user may retrain with a different
    # number of workers, so we need the raw learning rate to adjust to the
    # new worker count.
    transform_spec = None
    if transformation:
        transform_spec = TransformSpec(transformation)

    schema_fields = feature_columns + label_columns
    if sample_weight_col:
        schema_fields.append(sample_weight_col)

    if train_steps_per_epoch is None:
        steps_per_epoch = int(math.ceil(float(train_rows) / batch_size /
                                        hvd.size()))
    else:
        steps_per_epoch = train_steps_per_epoch

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
        log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
        ckpt_file = os.path.join(run_output_dir,
                                 remote_store.checkpoint_filename)

        def save_checkpoint():
            model.cpu()
            optimizer_with_scaled_down_lr = \
                get_optimizer_with_unscaled_lr(hvd, optimizer,
                                               optimizer_cls, model)
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer_with_scaled_down_lr.state_dict(),
            }
            torch.save(state, ckpt_file)
            if cuda_available:
                model.cuda()

        # In general, make_batch_reader is faster than make_reader for
        # reading the dataset. However, we found that make_reader performs
        # data transformations much faster than make_batch_reader with
        # parallel worker processes. Therefore, the default reader we choose
        # is make_batch_reader unless there are data transformations.
        reader_factory = None
        reader_factory_kwargs = dict()
        if transform_spec:
            reader_factory = make_reader
            reader_factory_kwargs['pyarrow_serialize'] = True
        else:
            reader_factory = make_batch_reader

        # Petastorm: read data from the store with the correct shard for
        # this rank. Setting num_epochs=None will cause an infinite iterator
        # and enables ranks to perform training and validation with an
        # unequal number of samples.
        with reader_factory(remote_store.train_data_path,
                            num_epochs=None,
                            cur_shard=hvd.rank(),
                            reader_pool_type='process',
                            workers_count=train_reader_worker_count,
                            shard_count=hvd.size(),
                            hdfs_driver=PETASTORM_HDFS_DRIVER,
                            schema_fields=schema_fields,
                            transform_spec=transform_spec,
                            **reader_factory_kwargs) as train_reader:
            with reader_factory(remote_store.val_data_path,
                                num_epochs=None,
                                cur_shard=hvd.rank(),
                                reader_pool_type='process',
                                workers_count=val_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                train_loader = BatchedDataLoader(
                    train_reader,
                    batch_size=batch_size,
                    shuffling_queue_capacity=shuffle_buffer_size)
                train_loader_iter = iter(train_loader)

                def prepare_batch(row):
                    inputs = [
                        prepare_np_data(row[col].float(), col,
                                        metadata).reshape(shape)
                        for col, shape in zip(feature_columns, input_shapes)]
                    labels = [
                        prepare_np_data(row[col].float(), col, metadata)
                        for col in label_columns]

                    sample_weights = row.get(sample_weight_col, None)
                    if sample_weights is not None:
                        sample_weights = sample_weights.float()
                    if cuda_available:
                        inputs = [input.cuda() for input in inputs]
                        labels = [label.cuda() for label in labels]
                        if sample_weights is not None:
                            sample_weights = sample_weights.cuda()
                    return inputs, labels, sample_weights

                def transform_outputs(outputs, labels):
                    if not isinstance(outputs, tuple) and \
                            not isinstance(outputs, list):
                        outputs = [outputs]

                    # reshape labels to match the output shape of the model
                    if hasattr(outputs[0], 'shape'):
                        if label_shapes:
                            labels = [label.reshape(label_shape)
                                      for label, label_shape
                                      in zip(labels, label_shapes)]
                        else:
                            # If the label_shapes parameter is not provided,
                            # reshape the label columns data to match the
                            # shape of the model output
                            labels = [label.reshape(output.shape)
                                      if output.shape.numel() ==
                                      label.shape.numel() else label
                                      for label, output
                                      in zip(labels, outputs)]
                    return outputs, labels

                def aggregate_metrics(stage, epoch, loss,
                                      metric_value_groups):
                    all_metric_groups_values = get_metric_avgs(
                        metric_value_groups)
                    if remote_store.saving_runs:
                        write_metrics_summary(stage, epoch, loss,
                                              all_metric_groups_values,
                                              log_writer)
                    return {
                        loss.name: loss.avg.item(),
                        'all_metrics': all_metric_groups_values
                    }

                def loss_fn(outputs, labels, sample_weights):
                    loss = calculate_loss(outputs, labels, loss_weights,
                                          loss_fns, sample_weights)
                    return loss

                def print_metrics(batch_idx, loss, metric_value_groups,
                                  phase):
                    if user_verbose > 0 and hvd.rank() == 0 and \
                            batch_idx % METRIC_PRINT_FREQUENCY == 0:
                        print("epoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}"
                              .format(epoch=epoch, batch_idx=batch_idx,
                                      metrics=aggregate_metrics(
                                          phase, epoch, loss,
                                          metric_value_groups)))

                def _train(epoch):
                    model.train()
                    train_loss = metric_cls('loss', hvd)
                    metric_value_groups = construct_metric_value_holders(
                        metric_cls, metric_fn_groups, label_columns, hvd)

                    # iterate on one epoch
                    for batch_idx in range(steps_per_epoch):
                        row = next(train_loader_iter)
                        inputs, labels, sample_weights = prepare_batch(row)
                        outputs, loss = train_minibatch(
                            model, optimizer, transform_outputs, loss_fn,
                            inputs, labels, sample_weights)
                        update_metrics(metric_value_groups, outputs, labels)
                        train_loss.update(loss)
                        print_metrics(batch_idx, train_loss,
                                      metric_value_groups, 'train')
                    return aggregate_metrics('train', epoch, train_loss,
                                             metric_value_groups)

                if should_validate:
                    val_loader = BatchedDataLoader(val_reader,
                                                   batch_size=batch_size)
                    val_loader_iter = iter(val_loader)
                    if validation_steps_per_epoch is None:
                        validation_steps = int(math.ceil(
                            float(val_rows) / batch_size / hvd.size()))
                    else:
                        validation_steps = validation_steps_per_epoch

                    def _validate(epoch):
                        model.eval()
                        val_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # iterate on one epoch
                        for batch_idx in range(validation_steps):
                            row = next(val_loader_iter)
                            inputs, labels, sample_weights = \
                                prepare_batch(row)
                            outputs = model(*inputs)
                            outputs, labels = transform_outputs(outputs,
                                                                labels)
                            loss = calculate_loss(outputs, labels,
                                                  loss_weights, loss_fns,
                                                  sample_weights)
                            val_loss.update(loss)
                            update_metrics(metric_value_groups, outputs,
                                           labels)
                            print_metrics(batch_idx, val_loss,
                                          metric_value_groups, 'val')
                        return aggregate_metrics('val', epoch, val_loss,
                                                 metric_value_groups)

                history = []
                for epoch in range(epochs):
                    epoch_metrics = {
                        'epoch': epoch,
                        'train': _train(epoch)
                    }
                    if should_validate:
                        epoch_metrics['validation'] = _validate(epoch)
                    if user_verbose > 0:
                        print(epoch_metrics)
                    history.append(epoch_metrics)
                    if hvd.rank() == 0:
                        # Save model after every epoch
                        save_checkpoint()
                        if remote_store.saving_runs:
                            remote_store.sync(run_output_dir)

        if hvd.rank() == 0:
            best_checkpoint = torch.load(ckpt_file)
            serialized_checkpoint = io.BytesIO()
            torch.save(best_checkpoint, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return history, serialized_checkpoint
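# get_optimizer_with_unscaled_lr is called in save_checkpoint above, but its
# body is not shown in this excerpt. A minimal sketch of what such a helper
# could look like, given the comment at the end of the optimizer setup:
# rebuild the optimizer with each param group's learning rate divided back
# down by the worker count (signature inferred from the call site):
def get_optimizer_with_unscaled_lr(hvd, current_optimizer, optimizer_cls,
                                   model):
    optimizer_state = current_optimizer.state_dict()
    for group in optimizer_state['param_groups']:
        # undo the hvd.size() scale-up applied when training started
        group['lr'] = group['lr'] / hvd.size()
    # lr is a required constructor arg; load_state_dict overrides it
    new_optimizer = optimizer_cls(model.parameters(), lr=1)
    new_optimizer.load_state_dict(optimizer_state)
    return new_optimizer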
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))
    if rank != 0:
        LOGGER.disabled = True

    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(json.load(open(hps_file)))

    assert opts.split in opts.img_db and opts.split in opts.txt_db
    # load DBs and image dirs
    eval_img_db, eval_img_db_gt = load_img_feat(opts.img_db, model_opts)
    eval_txt_db = VcrTxtTokLmdb(opts.txt_db, -1)
    eval_dataset = VcrEvalDataset("val", eval_txt_db, img_db=eval_img_db,
                                  img_db_gt=eval_img_db_gt)

    # Prepare model
    model = UniterForVisualCommonsenseReasoning.from_pretrained(
        f'{opts.output_dir}/log/model.json', state_dict={}, img_dim=IMG_DIM)
    model.init_type_embedding()
    model.init_type_embedding_know()
    model.init_word_embedding(NUM_SPECIAL_TOKENS)
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    state_dict = checkpoint.get('model_state', checkpoint)
    matched_state_dict = {}
    unexpected_keys = set()
    missing_keys = set()
    for name, param in model.named_parameters():
        missing_keys.add(name)
    for key, data in state_dict.items():
        if key in missing_keys:
            matched_state_dict[key] = data
            missing_keys.remove(key)
        else:
            unexpected_keys.add(key)
    LOGGER.info(f"Unexpected_keys: {list(unexpected_keys)}")
    LOGGER.info(f"Missing_keys: {list(missing_keys)}")
    model.load_state_dict(matched_state_dict, strict=False)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=True, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 shuffle=False,
                                 collate_fn=vcr_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    results = evaluate(model, eval_dataloader)

    # write results as one JSON object per line
    output = '/src/vlkaf.json'
    with open(output, "w") as f:
        for item in results:
            f.write(json.dumps(item) + '\n')
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, "
                f"{opts.train_img_db}")
    if 'paired' in opts.model:
        DatasetCls = Nlvr2PairedDataset
        EvalDatasetCls = Nlvr2PairedEvalDataset
        collate_fn = nlvr2_paired_collate
        eval_collate_fn = nlvr2_paired_eval_collate
        if opts.model == 'paired':
            ModelCls = UniterForNlvr2Paired
        elif opts.model == 'paired-attn':
            ModelCls = UniterForNlvr2PairedAttn
        else:
            raise ValueError('unrecognized model type')
    elif opts.model == 'triplet':
        DatasetCls = Nlvr2TripletDataset
        EvalDatasetCls = Nlvr2TripletEvalDataset
        ModelCls = UniterForNlvr2Triplet
        collate_fn = nlvr2_triplet_collate
        eval_collate_fn = nlvr2_triplet_eval_collate
    else:
        raise ValueError('unrecognized model type')

    # data loaders
    train_dataloader = create_dataloader(opts.train_img_db, opts.train_txt_db,
                                         opts.train_batch_size, True,
                                         DatasetCls, collate_fn, opts)
    val_dataloader = create_dataloader(opts.val_img_db, opts.val_txt_db,
                                       opts.val_batch_size, False,
                                       EvalDatasetCls, eval_collate_fn, opts)
    test_dataloader = create_dataloader(opts.test_img_db, opts.test_txt_db,
                                        opts.val_batch_size, False,
                                        EvalDatasetCls, eval_collate_fn, opts)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = ModelCls.from_pretrained(opts.model_config,
                                     state_dict=checkpoint,
                                     img_dim=IMG_DIM)
    model.init_type_embedding()
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level='O2')

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'results'))  # store val predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataloader.dataset))
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    while True:
        for step, batch in enumerate(train_dataloader):
            targets = batch['targets']
            n_examples += targets.size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every process;
                    # do this before unscaling to make sure every process
                    # uses the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'Step {global_step}: '
                                f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    for split, loader in [('val', val_dataloader),
                                          ('test', test_dataloader)]:
                        LOGGER.info(f"Step {global_step}: start running "
                                    f"validation on {split} split...")
                        log, results = validate(model, loader, split)
                        with open(f'{opts.output_dir}/results/'
                                  f'{split}_results_{global_step}_'
                                  f'rank{rank}.csv', 'w') as f:
                            for id_, ans in results:
                                f.write(f'{id_},{ans}\n')
                        TB_LOGGER.log_scaler_dict(log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs")

    if opts.num_train_steps % opts.valid_steps != 0:
        for split, loader in [('val', val_dataloader),
                              ('test', test_dataloader)]:
            LOGGER.info(f"Step {global_step}: start running "
                        f"validation on {split} split...")
            log, results = validate(model, loader, split)
            with open(f'{opts.output_dir}/results/'
                      f'{split}_results_{global_step}_'
                      f'rank{rank}.csv', 'w') as f:
                for id_, ans in results:
                    f.write(f'{id_},{ans}\n')
            TB_LOGGER.log_scaler_dict(log)
        model_saver.save(model, global_step)
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)
    # train
    LOGGER.info(f"Loading Train Dataset "
                f"{opts.train_txt_dbs}, {opts.train_img_dbs}")
    train_datasets = []
    for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs):
        img_db, img_db_gt = load_img_feat(img_path, all_img_dbs, opts)
        qa_txt_db = VcrTxtTokLmdb(txt_path, opts.max_txt_len, task="qa")
        qar_txt_db = VcrTxtTokLmdb(txt_path, opts.max_txt_len, task="qar")
        train_datasets.append(
            VcrDataset(qa_txt_db, img_db_gt=img_db_gt, img_db=img_db))
        train_datasets.append(
            VcrDataset(qar_txt_db, img_db_gt=img_db_gt, img_db=img_db))
    train_dataset = ConcatDatasetWithLens(train_datasets)
    train_dataloader = build_dataloader(train_dataset, vcr_collate, True,
                                        opts)

    # val
    LOGGER.info(f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}")
    val_img_db, val_img_db_gt = load_img_feat(opts.val_img_db, all_img_dbs,
                                              opts)
    val_txt_db = VcrTxtTokLmdb(opts.val_txt_db, -1)
    val_dataset = VcrEvalDataset("val", val_txt_db, img_db=val_img_db,
                                 img_db_gt=val_img_db_gt)
    val_final_dataset = VcrEvalDataset("test", val_txt_db, img_db=val_img_db,
                                       img_db_gt=val_img_db_gt)
    val_dataloader = build_dataloader(val_dataset, vcr_eval_collate,
                                      False, opts)
    val_final_dataloader = build_dataloader(val_final_dataset,
                                            vcr_eval_collate, False, opts)

    # Prepare model
    if opts.checkpoint and opts.checkpoint_from == "pretrain":
        ckpt = torch.load(opts.checkpoint)
        checkpoint = {k.replace('bert', 'uniter'): v for k, v in ckpt.items()}
    else:
        checkpoint = {}

    all_dbs = opts.train_txt_dbs + [opts.val_txt_db]
    toker = json.load(open(f'{all_dbs[0]}/meta.json'))['bert']
    assert all(toker == json.load(open(f'{db}/meta.json'))['bert']
               for db in all_dbs)
    model = UniterForVisualCommonsenseReasoning.from_pretrained(
        opts.model_config, checkpoint, img_dim=IMG_DIM)
    model.init_type_embedding()
    model.init_word_embedding(NUM_SPECIAL_TOKENS)
    if opts.checkpoint_from == "vcr_pretrain":
        ckpt = torch.load(opts.checkpoint)
        checkpoint = {k.replace('bert', 'uniter'): v for k, v in ckpt.items()}
        state_dict = checkpoint.get('model_state', checkpoint)
        matched_state_dict = {}
        unexpected_keys = set()
        missing_keys = set()
        for name, param in model.named_parameters():
            missing_keys.add(name)
        for key, data in state_dict.items():
            if key in missing_keys:
                matched_state_dict[key] = data
                missing_keys.remove(key)
            else:
                unexpected_keys.add(key)
        print("Unexpected_keys:", list(unexpected_keys))
        print("Missing_keys:", list(missing_keys))
        model.load_state_dict(matched_state_dict, strict=False)
    del checkpoint
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level='O2')

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'results'))  # store VQA predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataset) * hvd.size())
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    while True:
        for step, batch in enumerate(train_dataloader):
            n_examples += batch['input_ids'].size(0)

            # ============= Code for adversarial training =============
            if opts.adv_training:
                # initialize delta
                txt_embeds_init = model.uniter.embeddings.word_embeddings(
                    batch['input_ids'])
                img_embeds_init = batch['img_feat']
                # for simplicity, we initialize the delta as zero vectors,
                # which performs very similarly to initializing randomly
                # using normal or uniform distributions
                txt_delta = torch.zeros_like(txt_embeds_init)
                img_delta = torch.zeros_like(img_embeds_init)

                # calculate the prob. scores for clean samples
                gt_answer_scores = model(batch, compute_loss=False)
                gt_answer_prob = F.softmax(gt_answer_scores, dim=1)
                gt_answer_logprob = F.log_softmax(gt_answer_scores, dim=1)

                # the main loop
                for astep in range(opts.adv_steps):
                    # (0) forward
                    if opts.adv_modality == ["text"]:
                        txt_delta.requires_grad_()
                        img_delta = torch.zeros_like(img_embeds_init)
                    elif opts.adv_modality == ["image"]:
                        img_delta.requires_grad_()
                        txt_delta = torch.zeros_like(txt_embeds_init)
                    else:
                        txt_delta.requires_grad_()
                        img_delta.requires_grad_()

                    if "alter" not in opts.adv_modality:
                        answer_scores = model(
                            batch, adv_training=True,
                            adv_modality=opts.adv_modality,
                            adv_delta_txt=txt_delta,
                            adv_delta_img=img_delta,
                            compute_loss=False)
                        # CE loss
                        ce_loss = F.cross_entropy(
                            answer_scores, batch['targets'].squeeze(-1),
                            reduction='mean')

                        # KL loss
                        answer_prob = F.softmax(answer_scores, dim=1)
                        answer_logprob = F.log_softmax(answer_scores, dim=1)
                        kl_loss = F.kl_div(
                            answer_logprob, gt_answer_prob,
                            reduction='none') + \
                            F.kl_div(
                                gt_answer_logprob, answer_prob,
                                reduction='none')
                        kl_loss = kl_loss.mean()

                        # (1) backward
                        loss = (ce_loss + opts.adv_kl_weight * kl_loss) / \
                            opts.adv_steps
                    else:
                        answer_scores_1 = model(
                            batch, adv_training=True,
                            adv_modality=["text"],
                            adv_delta_txt=txt_delta,
                            adv_delta_img=None,
                            compute_loss=False)
                        # CE loss (computed on answer_scores_1; the original
                        # referenced an undefined `answer_scores` here)
                        ce_loss_1 = F.cross_entropy(
                            answer_scores_1, batch['targets'].squeeze(-1),
                            reduction='mean')

                        answer_scores_2 = model(
                            batch, adv_training=True,
                            adv_modality=["image"],
                            adv_delta_txt=None,
                            adv_delta_img=img_delta,
                            compute_loss=False)
                        # CE loss (likewise computed on answer_scores_2)
                        ce_loss_2 = F.cross_entropy(
                            answer_scores_2, batch['targets'].squeeze(-1),
                            reduction='mean')

                        # KL loss
                        answer_prob_1 = F.softmax(answer_scores_1, dim=1)
                        answer_logprob_1 = F.log_softmax(answer_scores_1,
                                                         dim=1)
                        answer_prob_2 = F.softmax(answer_scores_2, dim=1)
                        answer_logprob_2 = F.log_softmax(answer_scores_2,
                                                         dim=1)
                        kl_loss_1 = F.kl_div(
                            answer_logprob_1, gt_answer_prob,
                            reduction='none') + \
                            F.kl_div(
                                gt_answer_logprob, answer_prob_1,
                                reduction='none')
                        kl_loss_1 = kl_loss_1.mean()
                        kl_loss_2 = F.kl_div(
                            answer_logprob_2, gt_answer_prob,
                            reduction='none') + \
                            F.kl_div(
                                gt_answer_logprob, answer_prob_2,
                                reduction='none')
                        kl_loss_2 = kl_loss_2.mean()

                        # (1) backward
                        loss = (ce_loss_1 + ce_loss_2
                                + opts.adv_kl_weight *
                                (kl_loss_1 + kl_loss_2)) / \
                            (opts.adv_steps * 2)

                    delay_unscale = (
                        (step + 1) % opts.gradient_accumulation_steps != 0
                    ) or ((astep + 1) % opts.adv_steps != 0)
                    with amp.scale_loss(
                            loss, optimizer,
                            delay_unscale=delay_unscale) as scaled_loss:
                        scaled_loss.backward(retain_graph=True)
                        if not delay_unscale:
                            # gather gradients from every process;
                            # do this before unscaling to make sure every
                            # process uses the same gradient scale
                            grads = [p.grad.data
                                     for p in model.parameters()
                                     if p.requires_grad and
                                     p.grad is not None]
                            all_reduce_and_rescale_tensors(grads, float(1))
                    running_loss(loss.item())

                    if astep == opts.adv_steps - 1:
                        # last step: no further updates on delta
                        break

                    # (2) get gradient on delta
                    # fix fp16 problem
                    amp_scale = scaled_loss.item() // loss.item()
                    if "text" in opts.adv_modality:
                        txt_delta_grad = txt_delta.grad.clone().detach()
                        txt_delta_grad = txt_delta_grad.float() / amp_scale
                    if "image" in opts.adv_modality:
                        img_delta_grad = img_delta.grad.clone().detach()
                        img_delta_grad = img_delta_grad.float() / amp_scale

                    # (3) update and clip for txt delta
                    if "text" in opts.adv_modality:
                        if opts.norm_type == "l2":
                            denorm = torch.norm(
                                txt_delta_grad.view(
                                    txt_delta_grad.size(0), -1),
                                dim=1).view(-1, 1, 1)
                            denorm = torch.clamp(denorm, min=1e-8)
                            txt_delta_step = (opts.adv_lr_txt *
                                              txt_delta_grad /
                                              denorm).to(txt_delta)
                            txt_delta = (txt_delta + txt_delta_step).detach()
                            if opts.adv_max_norm > 0:
                                delta_norm = torch.norm(
                                    txt_delta.view(txt_delta.size(0), -1),
                                    p=2, dim=1).detach()
                                exceed_mask = (delta_norm >
                                               opts.adv_max_norm
                                               ).to(txt_embeds_init)
                                reweights = (opts.adv_max_norm / delta_norm *
                                             exceed_mask +
                                             (1 - exceed_mask)).view(-1, 1, 1)
                                txt_delta = (txt_delta * reweights).detach()
                        elif opts.norm_type == "linf":
                            denorm = torch.norm(
                                txt_delta_grad.view(
                                    txt_delta_grad.size(0), -1),
                                dim=1, p=float("inf")).view(-1, 1, 1)
                            denorm = torch.clamp(denorm, min=1e-8)
                            txt_delta_step = (opts.adv_lr_txt *
                                              txt_delta_grad /
                                              denorm).to(txt_delta)
                            txt_delta = (txt_delta + txt_delta_step).detach()
                            if opts.adv_max_norm > 0:
                                txt_delta = torch.clamp(
                                    txt_delta, -opts.adv_max_norm,
                                    opts.adv_max_norm).detach()

                    # (4) update and clip for image delta
                    if "image" in opts.adv_modality:
                        if opts.norm_type == "l2":
                            denorm = torch.norm(
                                img_delta_grad.view(
                                    img_delta_grad.size(0), -1),
                                dim=1).view(-1, 1, 1)
                            denorm = torch.clamp(denorm, min=1e-8)
                            img_delta_step = (opts.adv_lr_img *
                                              img_delta_grad /
                                              denorm).to(img_delta)
                            img_delta = (img_delta + img_delta_step).detach()
                            if opts.adv_max_norm > 0:
                                delta_norm = torch.norm(
                                    img_delta.view(img_delta.size(0), -1),
                                    p=2, dim=1).detach()
                                exceed_mask = (delta_norm >
                                               opts.adv_max_norm
                                               ).to(img_embeds_init)
                                reweights = (opts.adv_max_norm / delta_norm *
                                             exceed_mask +
                                             (1 - exceed_mask)).view(-1, 1, 1)
                                img_delta = (img_delta * reweights).detach()
                        elif opts.norm_type == "linf":
                            denorm = torch.norm(
                                img_delta_grad.view(
                                    img_delta_grad.size(0), -1),
                                dim=1, p=float("inf")).view(-1, 1, 1)
                            denorm = torch.clamp(denorm, min=1e-8)
                            img_delta_step = (opts.adv_lr_img *
                                              img_delta_grad /
                                              denorm).to(img_delta)
                            img_delta = (img_delta + img_delta_step).detach()
                            if opts.adv_max_norm > 0:
                                img_delta = torch.clamp(
                                    img_delta, -opts.adv_max_norm,
                                    opts.adv_max_norm).detach()
            else:
                loss = model(batch, compute_loss=True)
                loss = loss.mean()
                delay_unscale = (
                    (step + 1) % opts.gradient_accumulation_steps != 0)
                with amp.scale_loss(
                        loss, optimizer,
                        delay_unscale=delay_unscale) as scaled_loss:
                    scaled_loss.backward()
                    if not delay_unscale:
                        # gather gradients from every process;
                        # do this before unscaling to make sure every
                        # process uses the same gradient scale
                        grads = [p.grad.data for p in model.parameters()
                                 if p.requires_grad and p.grad is not None]
                        all_reduce_and_rescale_tensors(grads, float(1))
                running_loss(loss.item())
            # ============================ End ==========================

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for i, param_group in enumerate(optimizer.param_groups):
                    if i == 0 or i == 1:
                        param_group['lr'] = lr_this_step * opts.lr_mul
                    elif i == 2 or i == 3:
                        param_group['lr'] = lr_this_step
                    else:
                        raise ValueError()
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info(f'============Step {global_step}=============')
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec,
                                         global_step)
                    LOGGER.info('===========================================')

                if global_step % opts.valid_steps == 0:
                    val_log, results = validate(model, val_dataloader)
                    TB_LOGGER.log_scaler_dict(val_log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"finished {n_epoch} epochs")

    if global_step % opts.valid_steps != 0:
        val_log, results = validate(model, val_dataloader)
        TB_LOGGER.log_scaler_dict(val_log)
    val_log, results = validate(model, val_final_dataloader)
    with open(f'{opts.output_dir}/results/'
              f'results_{global_step}_final_qa_qar_'
              f'rank{rank}.json', 'w') as f:
        json.dump(results, f)
    TB_LOGGER.log_scaler_dict(val_log)
    model_saver.save(model, global_step)
def get_style_attribute_pairs():
    # this function is written with horovod to accelerate the extraction
    # (by n_gpu times)
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    torch.manual_seed(hvd.rank() * 999 + 1)
    if hvd.rank() == 0:
        print(' * Extracting style-attribute pairs...')
    # build and load the pre-trained attribute predictor on CelebA-HQ
    predictor = models.get_pretrained('attribute-predictor').to(device)
    # build and load the pre-trained anycost generator
    generator = models.get_pretrained('generator', config).to(device)

    predictor.eval()
    generator.eval()

    # randomly generate images and feed them to the predictor
    # configs from https://github.com/genforce/interfacegan
    randomized_noise = False
    truncation_psi = 0.7
    batch_size = 16
    n_batch = 500000 // (batch_size * hvd.size())
    styles = []
    attributes = []

    mean_style = generator.mean_style(100000).view(1, 1, -1)
    assert space in ['w', 'w+', 'z']
    for _ in tqdm(range(n_batch), disable=hvd.rank() != 0):
        if space in ['w', 'z']:
            z = torch.randn(batch_size, 1, generator.style_dim, device=device)
        else:
            z = torch.randn(batch_size, generator.n_style,
                            generator.style_dim, device=device)
        images, w = generator(z, return_styles=True,
                              truncation=truncation_psi,
                              truncation_style=mean_style,
                              input_is_style=False,
                              randomize_noise=randomized_noise)
        images = F.interpolate(images.clamp(-1, 1), size=256,
                               mode='bilinear', align_corners=True)
        attr = predictor(images)
        # move to cpu to save memory
        if space == 'w+':
            styles.append(w.to('cpu'))
        elif space == 'w':
            # originally duplicated
            styles.append(w.mean(1, keepdim=True).to('cpu'))
        else:
            styles.append(z.to('cpu'))
        attributes.append(attr.to('cpu'))

    styles = torch.cat(styles, dim=0)
    attributes = torch.cat(attributes, dim=0)

    styles = hvd.allgather(styles, name='styles')
    attributes = hvd.allgather(attributes, name='attributes')
    if hvd.rank() == 0:
        print(styles.shape, attributes.shape)
        torch.save(attributes, 'attributes_{}.pt'.format(config))
        torch.save(styles, 'styles_{}.pt'.format(config))
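# hvd.allgather concatenates each rank's tensor along dim 0 and returns the
# full result on every rank, which is what lets the function above assemble
# the complete style/attribute set. A minimal sketch:
import torch
import horovod.torch as hvd

hvd.init()
local = torch.full((2, 3), float(hvd.rank()))
gathered = hvd.allgather(local, name='example')
print(gathered.shape)  # (2 * world_size, 3), rows in rank order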
def setup(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    use_cuda = config.get("use_cuda", False)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler,
                          momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average)

    return model, optimizer, train_loader, train_sampler
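# A sketch of how setup()'s return values might be consumed per epoch. The
# train_epoch name and the nll_loss choice are illustrative assumptions
# (they presume Net() ends in log_softmax, as in the stock Horovod MNIST
# example):
import torch.nn.functional as F

def train_epoch(model, optimizer, train_loader, train_sampler, epoch):
    model.train()
    train_sampler.set_epoch(epoch)  # reshuffle shards each epoch
    for data, target in train_loader:
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()  # DistributedOptimizer allreduces grads here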
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, "log"))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, "ckpt"))
        add_log_to_file(join(opts.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    all_dbs = [db for datasets in [opts.train_datasets, opts.val_datasets]
               for dset in datasets for db in dset["db"]]
    tokenizer = json.load(open(f"{all_dbs[0]}/meta.json"))["bert"]
    assert all(tokenizer == json.load(open(f"{db}/meta.json"))["bert"]
               for db in all_dbs)

    # build data loaders
    train_dataloaders, all_img_dbs = create_dataloaders(
        opts.train_datasets, True, opts)
    val_dataloaders, _ = create_dataloaders(
        opts.val_datasets, False, opts, all_img_dbs)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = UniterForPretraining.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM)
    model.to(device)
    model.train()
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level="O2")

    global_step = 0
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # to compute training statistics
    task2loss = {task: RunningMeter(f"loss/{task}")
                 for task in train_dataloaders.keys()}
    # ITM w/ OT
    if opts.itm_ot_lambda > 0:
        for task in train_dataloaders.keys():
            if task.startswith("itm"):
                task2loss[f"{task}_xe"] = RunningMeter(f"loss/{task}_xe")
                task2loss[f"{task}_ot"] = RunningMeter(f"loss/{task}_ot")
                task2loss[f"{task}_ot_pos"] = RunningMeter(
                    f"loss/{task}_ot_pos")
                task2loss[f"{task}_ot_neg"] = RunningMeter(
                    f"loss/{task}_ot_neg")

    n_examples = defaultdict(int)
    n_in_units = defaultdict(int)
    n_loss_units = defaultdict(int)
    grad_norm = 0

    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    for step, (name, batch) in enumerate(meta_loader):
        # forward pass
        n_examples[name] += batch["input_ids"].size(0)
        n_in_units[name] += (batch["attn_masks"] == 1).sum().item()
        task = name.split("_")[0]
        loss = model(batch, task=task, compute_loss=True)
        if task.startswith("itm"):
            # OT
            itm_loss, ot_loss = loss
            n_loss_units[name] += itm_loss.size(0)
            itm_loss = itm_loss.mean()
            if ot_loss is not None:
                ot_pos, ot_neg = ot_loss
                ot_loss = (ot_pos.sum() - ot_neg.sum()
                           ) / (ot_pos.size(0) + ot_neg.size(0))

                # NOTE: beware of empty tensor
                ot_pos = ot_pos.mean().item()
                if not math.isnan(ot_pos):
                    task2loss[f"{name}_ot_pos"](ot_pos)
                ot_neg = ot_neg.mean().item()
                if not math.isnan(ot_neg):
                    task2loss[f"{name}_ot_neg"](ot_neg)

                loss = itm_loss + opts.itm_ot_lambda * ot_loss
                task2loss[f"{name}_xe"](itm_loss.item())
                task2loss[f"{name}_ot"](ot_loss.item())
            else:
                loss = itm_loss
        else:
            n_loss_units[name] += loss.size(0)
            loss = loss.mean()  # loss is not normalized in model

        # backward pass
        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[name]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every process;
                # do this before unscaling to make sure every process
                # uses the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))
        task2loss[name](loss.item())

        # optimizer update and logging
        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr_this_step
            TB_LOGGER.add_scalar("lr", lr_this_step, global_step)

            # log loss
            # NOTE: not gathered across GPUs for efficiency
            TB_LOGGER.log_scaler_dict({l.name: l.val
                                       for l in task2loss.values()
                                       if l.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar("grad_norm", grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info(f"==============Step {global_step}===============")
                for t in train_dataloaders.keys():
                    assert all(tt == t for tt in all_gather_list(t))
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    tot_in = sum(all_gather_list(n_in_units[t]))
                    in_per_sec = int(tot_in / (time() - start))
                    tot_l = sum(all_gather_list(n_loss_units[t]))
                    l_per_sec = int(tot_l / (time() - start))
                    LOGGER.info(f"{t}: {tot_ex} examples trained at "
                                f"{ex_per_sec} ex/s")
                    TB_LOGGER.add_scalar(f"perf/{t}_ex_per_s", ex_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f"perf/{t}_in_per_s", in_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f"perf/{t}_loss_per_s", l_per_sec,
                                         global_step)
                LOGGER.info(f"===============================================")

            if global_step % opts.valid_steps == 0:
                LOGGER.info(f"Step {global_step}: start validation")
                validate(model, val_dataloaders)
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break

    if global_step % opts.valid_steps != 0:
        LOGGER.info(f"Step {global_step}: start validation")
        validate(model, val_dataloaders)
        model_saver.save(model, global_step)
def setup(self, model):
    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    if torch.cuda.is_available() and self.trainer.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.trainer.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.trainer.root_gpu)
        model.cuda(self.trainer.root_gpu)

    # avoid duplicating progress bar
    if hvd.rank() != 0 and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    optimizers, lr_schedulers, optimizer_frequencies = \
        self.trainer.init_optimizers(model)
    self.trainer.optimizers = optimizers
    self.trainer.lr_schedulers = lr_schedulers
    self.trainer.optimizer_frequencies = optimizer_frequencies

    # Horovod: scale the learning rate by the number of workers to account
    # for increased total batch size
    for optimizer in self.trainer.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    # Horovod: adjust base LR used by schedulers to match scaled optimizer
    # initial LR
    for scheduler in self.trainer.lr_schedulers:
        scheduler = scheduler['scheduler']
        if isinstance(scheduler, _LRScheduler):
            scheduler.base_lrs = [lr * hvd.size()
                                  for lr in scheduler.base_lrs]

    # Horovod: broadcast parameters & optimizer state to ensure consistent
    # initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.trainer.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups
                          for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters()
                if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.trainer.optimizers = [
        hvd.DistributedOptimizer(
            optimizer,
            named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.trainer.optimizers
    ]

    # 16-bit
    model, self.trainer.optimizers = \
        self.trainer.precision_connector.connect(model,
                                                 self.trainer.optimizers)

    # Update logger rank info from Horovod to avoid race conditions from
    # different ranks creating directories / writing files in the same
    # locations.
    self.trainer.global_rank = hvd.rank()
    rank_zero_only.rank = self.trainer.global_rank
    self.trainer.model = model
help='SGD momentum (default: 0.5)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)') parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() hvd.init() torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True hps_file = f'{opts.output_dir}/log/hps.json' model_opts = Struct(load_json(hps_file)) model_config = f'{opts.output_dir}/log/model_config.json' # load DBs and image dirs video_ids = get_video_ids(opts.query_txt_db) if opts.task != "didemo_video_only": video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, model_opts.vfeat_interval, model_opts) else: txt_meta = load_json(os.path.join(opts.query_txt_db, "meta.json")) video_db = load_video_only_dataset(opts.vfeat_db, txt_meta, model_opts.vfeat_interval, model_opts) assert opts.split in opts.query_txt_db q_txt_db = QueryTokLmdb(opts.query_txt_db, -1) if opts.task != "didemo_video_only": inf_dataset = VcmrFullEvalDataset else: inf_dataset = VcmrVideoOnlyFullEvalDataset eval_dataset = inf_dataset(video_ids, video_db, q_txt_db, distributed=model_opts.distributed_eval) # Prepare model if exists(opts.checkpoint): ckpt_file = opts.checkpoint else: ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt' checkpoint = torch.load(ckpt_file) img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings" + ".position_embeddings.weight") assert img_pos_embed_weight_key in checkpoint max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) model = HeroForVcmr.from_pretrained( model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lw_neg_ctx=model_opts.lw_neg_ctx, lw_neg_q=model_opts.lw_neg_q, lw_st_ed=0, ranking_loss_type=model_opts.ranking_loss_type, use_hard_negative=False, hard_pool_size=model_opts.hard_pool_size, margin=model_opts.margin, use_all_neg=model_opts.use_all_neg, drop_svmr_prob=model_opts.drop_svmr_prob) model.to(device) if opts.fp16: model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=vcmr_full_eval_collate) eval_dataloader = PrefetchLoader(eval_dataloader) _, results = validate_full_vcmr(model, eval_dataloader, opts.split, opts, model_opts) result_dir = f'{opts.output_dir}/results_{opts.split}' if not exists(result_dir) and rank == 0: os.makedirs(result_dir) all_results_list = all_gather_list(results) if hvd.rank() == 0: all_results = {"video2idx": all_results_list[0]["video2idx"]} for rank_id in range(hvd.size()): for key, val in all_results_list[rank_id].items(): if key == "video2idx": continue if key not in all_results: all_results[key] = [] all_results[key].extend(all_results_list[rank_id][key]) LOGGER.info('All results joined......') save_json(all_results, f'{result_dir}/results_{opts.checkpoint}_all.json') LOGGER.info('All results written......')
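# all_gather_list above collects a picklable object from every rank into a
# rank-ordered list. One possible implementation (an assumption; newer
# Horovod ships this capability as hvd.allgather_object, which other snippets
# here already call):
import horovod.torch as hvd

def all_gather_list(obj):
    return hvd.allgather_object(obj)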
def train(serialized_model):
    import horovod.torch as hvd

    # Horovod: initialize library.
    hvd.init()

    with tempfile.TemporaryDirectory() as last_ckpt_dir, \
            remote_store.get_local_output_dir() as run_output_dir:
        last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt')
        if ckpt_bytes:
            with open(last_ckpt_file, 'wb') as f:
                f.write(ckpt_bytes)

        # TODO: Pass the logger from estimator constructor
        logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)

        # Use default logger if no logger is supplied
        train_logger = logger
        if train_logger is None:
            train_logger = TensorBoardLogger(logs_path)

        # TODO: find out a way to use ckpt_path created from remote store,
        # but all other parameters ingest from estimator config
        # ckpt_path = os.path.join(run_output_dir,
        #                          remote_store.checkpoint_filename)
        # os.makedirs(ckpt_path, exist_ok=True)
        # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)
        # callbacks.append(model_checkpoint_callback)

        is_model_checkpoint_callback_exist = False
        if callbacks is not None:
            for cb in callbacks:
                if isinstance(cb, ModelCheckpoint):
                    is_model_checkpoint_callback_exist = True
                    break

        model = deserialize(serialized_model)

        _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
            int(math.floor(float(train_rows) / batch_size / hvd.size()))
        _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
            int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

        print(f"Training data of rank[{hvd.local_rank()}]: train_rows:{train_rows}, "
              f"batch_size:{batch_size}, _train_steps_per_epoch:{_train_steps_per_epoch}.")
        print(f"Validation data of rank[{hvd.local_rank()}]: val_rows:{val_rows}, "
              f"val_batch_size:{val_batch_size}, _val_steps_per_epoch:{_val_steps_per_epoch}, "
              f"should_validate:{should_validate}.")

        cuda_available = torch.cuda.is_available()
        # We need to check that all ranks have the same device type for training.
        # Horovod doesn't support heterogeneous allreduce for gradients.
        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
        if cuda_avail_list.count(cuda_available) != hvd.size():
            raise RuntimeError("Not all ranks have the same device type!")

        if cuda_available:
            # Horovod: pin GPU to local rank or the assigned GPU from spark.
            torch.cuda.set_device(
                _get_assigned_gpu_or_default(default=hvd.local_rank()))
            # Move model to GPU.
model.cuda() _num_gpus = num_gpus if _num_gpus is None: _num_gpus = 1 if cuda_available else 0 kwargs = { 'accelerator': 'horovod', 'gpus': _num_gpus, 'callbacks': callbacks, 'max_epochs': epochs, 'logger': train_logger, 'log_every_n_steps': log_every_n_steps, 'resume_from_checkpoint': (last_ckpt_file if ckpt_bytes else None), 'checkpoint_callback': is_model_checkpoint_callback_exist, 'num_sanity_val_steps': 0, 'reload_dataloaders_every_epoch': False, 'progress_bar_refresh_rate': _train_steps_per_epoch // 10 } print("Creating trainer with: \n ", kwargs) trainer = Trainer(**kwargs) print(f"pytorch_lightning version={pl.__version__}") # print row group # pq.ParquetFile(remote_store.train_data_path) # for rowgroup in range(pq_file.metadata.num_row_groups): # row_group = pq_file.metadata.row_group(rowgroup) # print(row_group) with set_data_loader(model, remote_store.train_data_path, 'train_dataloader', train_reader_worker_count, reader_pool_type, calculate_shuffle_buffer_size(), name="train_dataloader", limit_step_per_epoch=_train_steps_per_epoch), \ set_data_loader(model, remote_store.val_data_path, 'val_dataloader', val_reader_worker_count, reader_pool_type, 0, should_validate, name="val_dataloader", limit_step_per_epoch=_val_steps_per_epoch): trainer.fit(model) serialized_checkpoint = io.BytesIO() module = model if not is_legacy else model._model # TODO: find a way to pass trainer.logged_metrics out. output = {'model': module.state_dict()} torch.save(output, serialized_checkpoint) serialized_checkpoint.seek(0) return serialized_checkpoint
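# Sketch of how the returned bytes could be consumed on the driver side
# (illustrative only; load_trained_state and its arguments are assumptions,
# not part of the original estimator code):
import io
import torch

def load_trained_state(serialized_checkpoint: io.BytesIO,
                       trained_model: torch.nn.Module) -> torch.nn.Module:
    serialized_checkpoint.seek(0)  # rewind before reading
    state = torch.load(serialized_checkpoint, map_location='cpu')
    trained_model.load_state_dict(state['model'])
    return trained_model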
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format( device, n_gpu, hvd.rank(), opts.fp16)) if opts.train_config is not None: train_opts = Struct(json.load(open(opts.train_config))) opts.conf_th = train_opts.conf_th opts.max_bb = train_opts.max_bb opts.min_bb = train_opts.min_bb opts.num_bb = train_opts.num_bb # load DBs and image dirs eval_img_db = DetectFeatLmdb(opts.img_db, opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) eval_txt_db = TxtTokLmdb(opts.txt_db, -1) eval_dataset = ItmEvalDataset(eval_txt_db, eval_img_db, opts.batch_size) # Prepare model ckpt = torch.load(opts.checkpoint) checkpoint = {k.replace('bert', 'uniter'): v for k, v in ckpt.items()} model = UniterForImageTextRetrieval.from_pretrained( opts.model_config, checkpoint, img_dim=IMG_DIM) if 'rank_output' not in checkpoint: model.init_output() # zero shot setting model.to(device) model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=1, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=itm_eval_collate) eval_dataloader = PrefetchLoader(eval_dataloader) eval_log, results = evaluate(model, eval_dataloader) if hvd.rank() == 0: if not exists(opts.output_dir) and rank == 0: os.makedirs(opts.output_dir) with open(f'{opts.output_dir}/config.json', 'w') as f: json.dump(vars(opts), f) with open(f'{opts.output_dir}/results.bin', 'wb') as f: pickle.dump(results, f) with open(f'{opts.output_dir}/scores.json', 'w') as f: json.dump(eval_log, f) LOGGER.info(f'evaluation finished') LOGGER.info( f"======================== Results =========================\n" f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n" f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n" f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n" f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n" f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n" f"text retrieval R10: {eval_log['txt_r10']*100:.2f}") LOGGER.info("========================================================")
def training_function(config):
    # Horovod: initialize library, then pin the GPU.
    hvd.init()
    if CUDA:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())

    # DATASET
    data = np.load(BASE_DIR + '/mnist.npz', allow_pickle=True)
    mnist_images_train = np.expand_dims(data['x_train'], 1)
    mnist_labels_train = data['y_train']
    mnist_images_test = np.expand_dims(data['x_test'], 1)
    mnist_labels_test = data['y_test']
    data.close()

    dataset_train = dt.TensorDataset(torch.Tensor(mnist_images_train),
                                     torch.Tensor(mnist_labels_train).long())
    dataset_test = dt.TensorDataset(torch.Tensor(mnist_images_test),
                                    torch.Tensor(mnist_labels_test).long())

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train, num_replicas=hvd.size(), rank=hvd.rank())
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_test, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=50,
                                               sampler=train_sampler)
    test_loader = torch.utils.data.DataLoader(dataset_test,
                                              batch_size=50,
                                              sampler=test_sampler)

    model = CNNClassifier()
    lr_scaler = hvd.size()

    if CUDA:
        # Move model to GPU.
        model.cuda()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=config["lr"] * lr_scaler,
                          momentum=0.5)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm, e.g. hvd.Compression.fp16.
    compression = hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
    )

    def train(epoch):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for data, target in train_loader:
            if CUDA:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
        train_avg_loss = metric_average(loss.item(), "train_avg_loss")
        if hvd.rank() == 0:
            print(f"Train Epoch: {epoch} Avg_loss: {train_avg_loss}")
            # TODO save data for tensorboard

    def test(epoch):
        model.eval()
        test_loss = 0.0
        test_accuracy = 0.0
        for data, target in test_loader:
            if CUDA:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, "avg_loss")
        test_accuracy = metric_average(test_accuracy, "avg_accuracy")

        if hvd.rank() == 0:
            print(
                "\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n".format(
                    test_loss, 100.0 * test_accuracy))

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            tune.report(loss=test_loss, accuracy=test_accuracy)
            # log mlflow metrics
            mlflow.log_metric("Test loss", test_loss, step=epoch)
            mlflow.log_metric("Accuracy", test_accuracy, step=epoch)

    if hvd.rank() == 0:
        mlflow.set_tracking_uri("file:/home/jovyan/mlruns")
        mlflow.set_experiment("mlflow_example_default_ray_final")
        mlflow.start_run(nested=False, run_name="y")

    for epoch in range(1, config["epochs"] + 1):
        train(epoch)
        test(epoch)

    if hvd.rank() == 0:
        mlflow.end_run()
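# metric_average is referenced above but not shown. The usual Horovod pattern
# (a sketch, assuming scalar inputs) averages the value across all workers
# with an allreduce, which defaults to averaging:
import torch
import horovod.torch as hvd

def metric_average(val, name):
    tensor = torch.tensor(val)
    avg_tensor = hvd.allreduce(tensor, name=name)  # averages by default
    return avg_tensor.item()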
    def batch_translate(self, input_path, output_path, field=0,
                        remove_subword_tokens=True, max_length=100,
                        resume=False):
        """Translate a file."""
        # Check whether using multiple GPUs
        try:
            import horovod.torch as hvd
        except ImportError:
            pass
        # If using multigpu, then separate the input file
        if self._is_multigpu:
            sync_tensor = torch.tensor(0)
            tmp_output_path = "/tmp/{}.{}".format(
                os.path.basename(output_path), hvd.local_rank())
        else:
            sync_tensor = None
            tmp_output_path = output_path
        result_map = {}
        if self._is_multigpu and resume and os.path.exists(tmp_output_path):
            for line in open(tmp_output_path):
                pair = line.strip("\n").split("\t")
                if len(pair) != 2:
                    print(line)
                    continue  # skip malformed lines instead of crashing below
                id, line = pair
                result_map[int(id)] = line
            print("loaded {} computed results".format(len(result_map)))
        fout = open(tmp_output_path, "w")
        test_lines = list(open(input_path))
        err = 0
        for i, line in enumerate(test_lines):
            # Gather error counts in multigpu mode
            if self._is_multigpu:
                if i % (10 * hvd.size()) == 0:
                    sync_tensor.fill_(err)
                    hvd.allreduce_(sync_tensor, average=False)
                if i % hvd.size() != hvd.local_rank():
                    continue
            # Translate
            pair = line.strip().split("\t")
            src_sent = pair[field]
            if len(src_sent.split()) > max_length:
                result = "x"
            else:
                if i in result_map:
                    result = result_map[i]
                else:
                    result, _ = self.translate("<s> {} </s>".format(src_sent))
                    if result is None:
                        result = ""
                    if remove_subword_tokens:
                        if "▁" in result:
                            result = "".join(result.split()).replace(
                                "▁", " ").strip()
                        else:
                            result = result.replace("@@ ", "")
                if not result:
                    err += 1
            # Write the results and print progress
            if self._is_multigpu:
                fout.write("{}\t{}\n".format(i, result))
            else:
                fout.write("{}\n".format(result))
            fout.flush()
            if self._is_multigpu and hvd.local_rank() == 0:
                sys.stdout.write("translating: {:.0f}% err: {} \r".format(
                    float(i + 1) * 100 / len(test_lines), int(sync_tensor)))
            elif not self._is_multigpu:
                sys.stdout.write("translating: {:.0f}% err: {} \r".format(
                    float(i + 1) * 100 / len(test_lines), err))
            sys.stdout.flush()
        if is_root_node():
            sys.stdout.write("\n")
        fout.close()
        if self._is_multigpu:
            # Wait for all processes to end
            hvd.allreduce_(sync_tensor, average=False)
            # Concatenate all separated translation results
            if hvd.local_rank() == 0:
                results = []
                for i in range(hvd.size()):
                    for line in open("/tmp/{}.{}".format(
                            os.path.basename(output_path), i)):
                        id, result = line.strip("\n").split("\t")
                        results.append((int(id), result))
                results.sort()
                with open(output_path, "w") as fout:
                    for _, result in results:
                        fout.write(result + "\n")
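# is_root_node above is an external helper. A plausible sketch (an
# assumption: it should be true on exactly one process, so the trailing
# newline is printed once):
def is_root_node():
    try:
        import horovod.torch as hvd
        return hvd.rank() == 0
    except ImportError:
        return True  # single-process fallback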
def train():
    hvd.init()
    # Data augmentation and normalization for training
    # Just normalization for validation
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
        'val':
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
    }

    data_dir = 'hymenoptera_data'
    image_datasets = {
        x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
        for x in ['train', 'val']
    }
    dataloaders = {
        x: torch.utils.data.DataLoader(image_datasets[x],
                                       batch_size=4,
                                       shuffle=True,
                                       num_workers=4)
        for x in ['train', 'val']
    }
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes
    device = torch.device("cuda:{}".format(hvd.local_rank()))
    print('device:', device)

    model_ft = models.resnet50(pretrained=False)
    num_ftrs = model_ft.fc.in_features
    # Here the size of each output sample is set to 2.
    # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
    model_ft.fc = nn.Linear(num_ftrs, 2)
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    # optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.002, momentum=0.9)

    # init distributed optimizer
    d_optimizer = hvd.DistributedOptimizer(
        optimizer=optimizer_ft,
        named_parameters=model_ft.named_parameters())
    hvd.broadcast_parameters(model_ft.state_dict(), root_rank=0)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    total_cost_time = 0.0
    total_count = 0.0
    model_ft.train()
    for epoch in range(250):
        running_loss = 0.0
        running_corrects = 0
        for inputs, labels in dataloaders['train']:
            start_time = time.time()
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            d_optimizer.zero_grad()

            outputs = model_ft(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            d_optimizer.step()

            cost_time = int(round((time.time() - start_time) * 1000))
            total_cost_time += cost_time
            total_count += 1

            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            print('Rank:{}, cost time:{}, avg time:{} epoch:{}, loss:{}'.format(
                hvd.rank(), cost_time, total_cost_time / total_count, epoch,
                loss.item()))
        print('--------------------------------------------------------------------------')
        epoch_loss = running_loss / dataset_sizes['train']
        epoch_acc = running_corrects.double() / dataset_sizes['train']
        print('Rank:{}, {} Loss: {:.4f} Acc: {:.4f}'.format(
            hvd.local_rank(), 'train', epoch_loss, epoch_acc))
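# Note: the DataLoader above shuffles with shuffle=True but has no
# DistributedSampler, so every Horovod worker iterates the full dataset each
# epoch. If per-worker sharding is intended, the usual pattern is the
# following sketch (reusing image_datasets and hvd from the snippet above):
train_sampler = torch.utils.data.distributed.DistributedSampler(
    image_datasets['train'], num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(image_datasets['train'],
                                           batch_size=4,
                                           sampler=train_sampler,
                                           num_workers=4)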
def local_rank(self) -> int: return hvd.local_rank()
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) # load DBs and image dirs all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) # train LOGGER.info(f"Loading Train Dataset " f"{opts.train_txt_dbs}, {opts.train_img_dbs}") train_datasets = [] for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs): img_db, img_db_gt = load_img_feat(img_path, all_img_dbs, opts) qa_txt_db = VcrTxtTokLmdb(txt_path, opts.max_txt_len, task="qa") qar_txt_db = VcrTxtTokLmdb(txt_path, opts.max_txt_len, task="qar") train_datasets.append( VcrDataset(qa_txt_db, img_db_gt=img_db_gt, img_db=img_db)) train_datasets.append( VcrDataset(qar_txt_db, img_db_gt=img_db_gt, img_db=img_db)) train_dataset = ConcatDatasetWithLens(train_datasets) train_dataloader = build_dataloader(train_dataset, vcr_collate, True, opts) # val LOGGER.info(f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}") val_img_db, val_img_db_gt = load_img_feat(opts.val_img_db, all_img_dbs, opts) val_txt_db = VcrTxtTokLmdb(opts.val_txt_db, -1) val_dataset = VcrEvalDataset("val", val_txt_db, img_db=val_img_db, img_db_gt=val_img_db_gt) val_final_dataset = VcrEvalDataset("test", val_txt_db, img_db=val_img_db, img_db_gt=val_img_db_gt) val_dataloader = build_dataloader(val_dataset, vcr_eval_collate, False, opts) val_final_dataloader = build_dataloader(val_final_dataset, vcr_eval_collate, False, opts) # Prepare model if opts.checkpoint and opts.checkpoint_from == "pretrain": checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} all_dbs = opts.train_txt_dbs + [opts.val_txt_db] toker = json.load(open(f'{all_dbs[0]}/meta.json'))['bert'] assert all(toker == json.load(open(f'{db}/meta.json'))['bert'] for db in all_dbs) model = UniterForVisualCommonsenseReasoning.from_pretrained( opts.model_config, checkpoint, img_dim=IMG_DIM) model.init_type_embedding() model.init_word_embedding(NUM_SPECIAL_TOKENS) if opts.checkpoint_from == "vcr_pretrain": checkpoint = torch.load(opts.checkpoint) state_dict = checkpoint.get('model_state', checkpoint) matched_state_dict = {} unexpected_keys = set() missing_keys = set() for name, param in model.named_parameters(): missing_keys.add(name) for key, data in state_dict.items(): if key in missing_keys: matched_state_dict[key] = data missing_keys.remove(key) else: unexpected_keys.add(key) print("Unexpected_keys:", list(unexpected_keys)) print("Missing_keys:", list(missing_keys)) model.load_state_dict(matched_state_dict, strict=False) del checkpoint model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level='O2') global_step = 0 if rank == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) os.makedirs(join(opts.output_dir, 'results')) # store VQA predictions 
add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Num examples = %d", len(train_dataset) * hvd.size()) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) running_loss = RunningMeter('loss') model.train() n_examples = 0 n_epoch = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() while True: for step, batch in enumerate(train_dataloader): n_examples += batch['input_ids'].size(0) loss = model(batch, compute_loss=True) loss = loss.mean() delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) running_loss(loss.item()) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for i, param_group in enumerate(optimizer.param_groups): if i == 0 or i == 1: param_group['lr'] = lr_this_step * opts.lr_mul elif i == 2 or i == 3: param_group['lr'] = lr_this_step else: raise ValueError() TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss # NOTE: not gathered across GPUs for efficiency TB_LOGGER.add_scalar('loss', running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput LOGGER.info(f'============Step {global_step}=============') tot_ex = sum(all_gather_list(n_examples)) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f'{tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec, global_step) LOGGER.info('===========================================') if global_step % opts.valid_steps == 0: val_log, results = validate(model, val_dataloader) TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break if global_step >= opts.num_train_steps: break n_epoch += 1 LOGGER.info(f"finished {n_epoch} epochs") if global_step % opts.valid_steps != 0: val_log, results = validate(model, val_dataloader) TB_LOGGER.log_scaler_dict(val_log) val_log, results = validate(model, val_final_dataloader) with open( f'{opts.output_dir}/results/' f'results_{global_step}_final_qa_qar_' f'rank{rank}.json', 'w') as f: json.dump(results, f) TB_LOGGER.log_scaler_dict(val_log) model_saver.save(model, global_step)
def main(): # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)') parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)') parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser.add_argument('--fp16-allreduce', action='store_true', default=False, help='use fp16 compression during allreduce') parser.add_argument('--results_path', type=str, help="Path to store results") args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) model = Net() if args.cuda: # Move model to GPU. model.cuda() # Horovod: broadcast parameters. hvd.broadcast_parameters(model.state_dict(), root_rank=0) global optimizer # Horovod: scale learning rate by the number of GPUs. optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), momentum=args.momentum) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. 
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression) #for epoch in range(1, args.epochs + 1): # train(epoch, model, optimizer, train_sampler, train_loader, args) # test(model, test_sampler, test_loader, args) space = [(2, 8)] #if hvd.rank() == 0: hyperdrive( lambda hparams: objective( hparams, model, train_sampler, train_loader, args ), hyperparameters=space, results_path=args.results_path, checkpoints_path=args.results_path, model="GP", n_iterations=50, verbose=True, random_state=0 )
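# The Net model instantiated in the MNIST example above is defined elsewhere.
# A sketch matching the small CNN from Horovod's stock MNIST example (an
# assumption about what this particular script uses):
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)  # pairs with F.nll_loss in training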
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) opts.n_gpu = n_gpu LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True set_random_seed(opts.seed) # data loaders train_dataloaders = {} val_dataloaders = {} for target, t_r in zip(opts.targets, opts.targets_ratio): train_loaders, val_loaders = build_target_loaders( target, t_r, opts) # -> choose which task and get corrsponding task dataloder train_dataloaders.update(train_loaders) val_dataloaders.update(val_loaders) meta_loader = MetaLoader(train_dataloaders, accum_steps=opts.gradient_accumulation_steps, distributed=n_gpu > 1) meta_loader = PrefetchLoader(meta_loader) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\ ".position_embeddings.weight" if img_pos_embed_weight_key in checkpoint: max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) else: max_frm_seq_len = MAX_FRM_SEQ_LEN if opts.load_partial_pretrained: # from roberta model = HeroForPretraining(VideoModelConfig(opts.model_config), vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lw_neg_ctx=opts.lw_neg_ctx, lw_neg_q=opts.lw_neg_q, lw_st_ed=0, ranking_loss_type=opts.ranking_loss_type, use_hard_negative=False, hard_pool_size=opts.hard_pool_size, margin=opts.margin, use_all_neg=opts.use_all_neg, drop_svmr_prob=opts.drop_svmr_prob) model.load_partial_pretrained(checkpoint, VFEAT_DIM, max_frm_seq_len, skip_layers=opts.skip_layer_loading) else: # continue training model = HeroForPretraining.from_pretrained( opts.model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lw_neg_ctx=opts.lw_neg_ctx, lw_neg_q=opts.lw_neg_q, lw_st_ed=0, ranking_loss_type=opts.ranking_loss_type, use_hard_negative=False, hard_pool_size=opts.hard_pool_size, margin=opts.margin, use_all_neg=opts.use_all_neg, drop_svmr_prob=opts.drop_svmr_prob) model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())} model, optimizer = amp.initialize(model, optimizer, num_losses=len(task2scaler), enabled=opts.fp16, opt_level='O2') restorer = TrainingRestorer(opts, model, optimizer) all_gather_list(None) # sync to prevent slower rank to read training meta global_step = restorer.global_step TB_LOGGER.global_step = global_step if hvd.rank() == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, 'log')) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, 'ckpt')) add_log_to_file(join(opts.output_dir, 'log', 'log.txt')) else: pbar = NoOp() model_saver = NoOp() restorer = NoOp() if global_step > 0: pbar.update(global_step) LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) task2loss = { task: RunningMeter(f'loss/{task}') for task in train_dataloaders.keys() } for task in train_dataloaders.keys(): if task.startswith('vsm'): for obj in ('st_ed', 'neg_ctx', 'neg_q'): task2loss[f"{task}_{obj}"] = 
RunningMeter(f'loss/{task}_{obj}') model.train() n_examples = defaultdict(int) start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() if global_step == 0: optimizer.step() assert all(global_step == s for s in all_gather_list(global_step)) for step, (task, batch) in enumerate(meta_loader): LOGGER.debug(f"Task: {task}") # hard negative in VSM if len(opts.hard_negtiave_start_step) > 0: for i, hn_step in enumerate(opts.hard_negtiave_start_step): if global_step >= hn_step and hn_step != -1: model.set_hard_negative(True, opts.hard_pool_size[i], opts.hard_neg_weights[i]) # start-end loss if opts.train_span_start_step != -1 and\ global_step >= opts.train_span_start_step: model.set_train_st_ed(opts.lw_st_ed) train_task = task.split('_')[0] n_examples[task] += opts.train_batch_size loss = model(batch, task=train_task, compute_loss=True) if train_task == 'vsm': loss_st_ed, loss_neg_ctx, loss_neg_q = loss loss = loss_st_ed + loss_neg_ctx + loss_neg_q for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed), ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx), ('neg_q', loss_neg_q, opts.lw_neg_q)): ls = ls.item() if w: ls /= w task2loss[f'{task}_{n}'](ls) elif train_task == "mffr": loss = torch.sqrt(loss.sum(dim=1)) loss = loss.mean() task2loss[task](loss.item()) delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale, loss_id=task2scaler[task]) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] LOGGER.debug("before reduce grad") all_reduce_and_rescale_tensors(grads, float(1)) LOGGER.debug("after reduce grad") if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # log loss # NOTE: only consider rank 0 for speed TB_LOGGER.log_scaler_dict({ ll.name: ll.val for ll in task2loss.values() if ll.val is not None }) TB_LOGGER.step() LOGGER.debug("before norm grad") # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step) LOGGER.debug("after norm grad") LOGGER.debug("before optim step") optimizer.step() optimizer.zero_grad() pbar.update(1) LOGGER.debug("after optim step") if global_step % 100 == 0: LOGGER.debug("after gather stats") # monitor training throughput LOGGER.info('-------------------------------------------') LOGGER.info(f'Step {global_step}:') for t in train_dataloaders.keys(): tot_ex = sum(all_gather_list(n_examples[t])) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f'{t}: {tot_ex} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec, global_step) LOGGER.debug("after gather stats") if global_step % opts.valid_steps == 0: LOGGER.info('===========================================') LOGGER.info(f"Step {global_step}: start running validation") validate(model, val_dataloaders, opts) LOGGER.info('===========================================') model_saver.save(model, global_step) # step restorer in the end to prevent missing validation checkpoint restorer.step() if global_step >= opts.num_train_steps: break 
LOGGER.info('===========================================') if global_step % opts.valid_steps != 0: LOGGER.info('===========================================') LOGGER.info(f"Step {global_step}: start running validation") validate(model, val_dataloaders, opts) LOGGER.info('===========================================') model_saver.save(model, global_step)
def run_training_epoch(self): # get model model = self.get_model() # Epoch start events with self.profiler.profile('on_epoch_start'): # callbacks self.on_epoch_start() # model hooks if self.is_function_implemented('on_epoch_start'): model.on_epoch_start() # track local dataloader so TPU can wrap each epoch train_dataloader = self.train_dataloader # on TPU we have to wrap it under the ParallelLoader if self.use_tpu and self.tpu_id is None: device = xm.xla_device() train_dataloader = xla_pl.ParallelLoader(train_dataloader, [device]) train_dataloader = train_dataloader.per_device_loader(device) # bookkeeping outputs = [] # run epoch for batch_idx, (batch, is_last_batch) in self.profiler.profile_iterable( enumerate(_with_is_last(train_dataloader)), "get_train_batch"): # stop epoch if we limited the number of training batches if batch_idx >= self.num_training_batches: break self.batch_idx = batch_idx model.global_step = self.global_step # --------------- # RUN TRAIN STEP # --------------- _outputs = self.run_training_batch(batch, batch_idx) batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs # only track outputs when user implements training_epoch_end # otherwise we will build up unnecessary memory if self.is_overridden('training_epoch_end', model=self.get_model()): outputs.append(batch_output) # when returning -1 from train_step, we end epoch early early_stop_epoch = batch_result == -1 # TODO: consolidate all actions that need to take place only after # self.accumulate_grad_batches steps (optimizer step, lr update, global step increment) if (self.batch_idx + 1) % self.accumulate_grad_batches == 0: # update lr self.update_learning_rates(interval='step') # --------------- # RUN VAL STEP # --------------- is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0 can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0 can_check_val = not self.disable_validation and can_check_epoch should_check_val = is_val_check_batch or early_stop_epoch should_check_val = should_check_val or ( is_last_batch and self.val_check_batch == float('inf')) should_check_val = can_check_val and should_check_val # --------------- # CHECKPOINTING, EARLY STOPPING # --------------- # fast_dev_run always forces val checking after train batch if self.fast_dev_run or should_check_val: self.run_evaluation(test_mode=self.testing) self.call_checkpoint_callback() self.call_early_stop_callback() # when logs should be saved should_save_log = ( batch_idx + 1) % self.log_save_interval == 0 or early_stop_epoch if should_save_log or self.fast_dev_run: if self.proc_rank == 0 and self.logger is not None: self.logger.save() # when metrics should be logged should_log_metrics = batch_idx % self.row_log_interval == 0 or early_stop_epoch if should_log_metrics or self.fast_dev_run: # logs user requested information to logger self.log_metrics(batch_step_metrics, grad_norm_dic) # progress global step according to grads progress if (self.batch_idx + 1) % self.accumulate_grad_batches == 0: self.global_step += 1 self.total_batch_idx += 1 # max steps reached, end training if self.max_steps is not None and self.max_steps == self.global_step: break # end epoch early # stop when the flag is changed or we've gone past the amount # requested in the batches if early_stop_epoch or self.fast_dev_run: break if self.use_horovod: hvd.join(hvd.local_rank() if self.on_gpu else -1) # process epoch outputs model = self.get_model() if self.is_overridden('training_epoch_end', model=model): epoch_output = 
model.training_epoch_end(outputs) _processed_outputs = self.process_output(epoch_output) log_epoch_metrics = _processed_outputs[2] callback_epoch_metrics = _processed_outputs[3] self.log_metrics(log_epoch_metrics, {}) self.callback_metrics.update(callback_epoch_metrics) self.add_progress_bar_metrics(_processed_outputs[1]) # when no val loop is present or fast-dev-run still need to call checkpoints if not self.is_overridden('validation_step') and not ( self.fast_dev_run or should_check_val): self.call_checkpoint_callback() self.call_early_stop_callback() # Epoch end events with self.profiler.profile('on_epoch_end'): # callbacks self.on_epoch_end() # model hooks if self.is_function_implemented('on_epoch_end'): model.on_epoch_end()
def train(serialized_model): import horovod.torch as hvd if random_seed is not None: pl.utilities.seed.seed_everything(seed=random_seed) # Horovod: initialize library. hvd.init() if verbose: import horovod as _horovod print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}") _checkpoint_callback = None require_checkpoint = False with remote_store.get_local_output_dir() as run_output_dir: logs_path = os.path.join(run_output_dir, remote_store.logs_subdir) os.makedirs(logs_path, exist_ok=True) print(f"Made directory {logs_path} for horovod rank {hvd.rank()}") ckpt_dir = run_output_dir ckpt_filename = remote_store.checkpoint_filename if logger is None: # Use default logger if no logger is supplied train_logger = TensorBoardLogger(logs_path) print(f"Setup logger: Using TensorBoardLogger: {train_logger}") elif isinstance(logger, CometLogger): if logger._experiment_key: # use logger passed in. train_logger = logger train_logger._save_dir = logs_path print(f"Setup logger: change save_dir of the logger to {logs_path}") elif logger_experiment_key: # Resume logger experiment with new log path if key passed correctly from CPU. train_logger = CometLogger( save_dir=logs_path, api_key=logger.api_key, experiment_key=logger_experiment_key, ) print(f"Setup logger: Resume comet logger: {vars(train_logger)}") else: print(f"Failed to setup or resume comet logger. origin logger: {vars(logger)}") else: # use logger passed in. train_logger = logger train_logger.save_dir = logs_path print(f"Setup logger: Using logger passed from estimator: {train_logger}") # Lightning requires to add checkpoint callbacks for all ranks. # Otherwise we are seeing hanging in training. for cb in callbacks: if isinstance(cb, ModelCheckpoint): cb.dirpath = ckpt_dir cb.filename = ckpt_filename _checkpoint_callback = cb require_checkpoint = True break if not _checkpoint_callback: # By default 'monitor'=None which saves a checkpoint only for the last epoch. _checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir, filename=ckpt_filename, verbose=True) callbacks.append(_checkpoint_callback) if remote_store.saving_runs and hvd.rank() == 0: # Horovod: sync checkpoint and logging files only on rank 0 to # prevent other ranks from corrupting them. class _SyncCallback(Callback): def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: remote_store.sync(run_output_dir) callbacks.append(_SyncCallback()) model = deserialize(serialized_model) _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \ int(math.floor(float(train_rows) / batch_size / hvd.size())) _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \ int(math.floor(float(val_rows) / val_batch_size / hvd.size())) shuffle_size = calculate_shuffle_buffer_size() if verbose: print(f"Training data of rank[{hvd.local_rank()}]: Epochs: {epochs}, " f"Shuffle_size: {shuffle_size}, Random seed: {random_seed}\n" f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {_train_steps_per_epoch}\n" f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n" f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n") if not should_use_gpu and verbose: print("Skip pinning current process to the GPU.") cuda_available = torch.cuda.is_available() if cuda_available and not should_use_gpu: print("GPU is available but use_gpu is set to False." 
"Training will proceed without GPU support.") cuda_available = False # We need to check all ranks have same device type for traning. # Horovod doesn't support heterogeneous allreduce for gradients. cuda_avail_list = hvd.allgather_object(cuda_available, name='device type') if cuda_avail_list.count(cuda_available) != hvd.size(): raise RuntimeError("All ranks don't have same device type!") if cuda_available: # Horovod: pin GPU to local rank or the assigned GPU from spark. torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank())) # Move model to GPU. model.cuda() _num_gpus = num_gpus if _num_gpus is None: _num_gpus = 1 if cuda_available else 0 # Set bar refresh to 1 / epoch, detailed loss and metrics is avaialbe in logger, # no need to print in screen here. User can still override this in trainer_args progress_bar_refresh_rate = _train_steps_per_epoch kwargs = {'accelerator': 'horovod', 'gpus': _num_gpus, 'callbacks': callbacks, 'max_epochs': epochs, 'logger': train_logger, 'log_every_n_steps': log_every_n_steps, 'num_sanity_val_steps': 0, 'reload_dataloaders_every_epoch': False, 'progress_bar_refresh_rate': progress_bar_refresh_rate, 'terminate_on_nan': terminate_on_nan, 'profiler': profiler } if trainer_args: kwargs.update(trainer_args) if verbose and hvd.rank() == 0: print("Creating trainer with: \n ", kwargs) trainer = Trainer(**kwargs) if profiler != 'simple' and trainer.profiler: print(f"Set profiler's logs_path for {hvd.rank()} to {logs_path}") trainer.profiler.dirpath = logs_path # filename where the profiler results will be saved instead of # printing to stdout. The .txt extension will be used automatically. trainer.profiler.filename = "profile" if verbose and hvd.rank() == 0: print(f"pytorch_lightning version={pl.__version__}") data_module_kwargs = { 'train_dir': remote_store.train_data_path, 'val_dir': remote_store.val_data_path, 'num_train_epochs': epochs, 'has_val': should_validate is not None, 'train_batch_size': batch_size, 'val_batch_size': val_batch_size, 'shuffle_size': shuffle_size, 'num_reader_epochs': loader_num_epochs, 'reader_pool_type': reader_pool_type, 'reader_worker_count': train_reader_worker_count, 'transformation': transformation, 'inmemory_cache_all': inmemory_cache_all, 'cur_shard': hvd.rank(), 'shard_count': hvd.size(), 'schema_fields': schema_fields, 'storage_options': storage_options, 'steps_per_epoch_train': _train_steps_per_epoch, 'steps_per_epoch_val': _val_steps_per_epoch, 'verbose': verbose, 'debug_data_loader': debug_data_loader, 'train_async_data_loader_queue_size': train_async_data_loader_queue_size, 'val_async_data_loader_queue_size': val_async_data_loader_queue_size, } if debug_data_loader and hvd.rank() == 0: print(f"Creating data module with args:\n {data_module_kwargs}") dataset = data_module(**data_module_kwargs) trainer.fit(model, dataset) if hvd.rank() == 0: if remote_store.saving_runs and trainer.profiler: # One more file sync to push profiler result. remote_store.sync(logs_path) # rank 0 overwrites model with best checkpoint and returns. if require_checkpoint: if verbose: print("load from checkpoint best model path:", _checkpoint_callback.best_model_path) best_model = model.load_from_checkpoint(_checkpoint_callback.best_model_path) else: best_model = model serialized_checkpoint = io.BytesIO() module = best_model if not is_legacy else best_model._model output = {'model': module.state_dict(), 'logged_metrics': trainer.logged_metrics} torch.save(output, serialized_checkpoint) return serialized_checkpoint
def training_forward(self, batch, batch_idx, opt_idx, hiddens): """ Handle forward for each training case (distributed, single gpu, etc...) :param batch: :param batch_idx: :return: """ # --------------- # FORWARD # --------------- # enable not needing to add opt_idx to training_step args = [batch, batch_idx] if len(self.optimizers) > 1: if self.has_arg('training_step', 'optimizer_idx'): args.append(opt_idx) else: num_opts = len(self.optimizers) raise ValueError( f'Your LightningModule defines {num_opts} optimizers but ' f'training_step is missing the "optimizer_idx" argument.') # pass hiddens if using tbptt if self.truncated_bptt_steps is not None: args.append(hiddens) # distributed forward if self.use_ddp or self.use_ddp2 or self.use_dp: output = self.model(*args) # Horovod elif self.use_horovod and self.on_gpu: batch = self.transfer_batch_to_gpu(batch, hvd.local_rank()) args[0] = batch output = self.model.training_step(*args) # single GPU forward elif self.single_gpu: gpu_id = 0 if isinstance(self.data_parallel_device_ids, list): gpu_id = self.data_parallel_device_ids[0] # Don't copy the batch since there is a single gpu that the batch could # be referenced from and if there are multiple optimizers the batch will # wind up copying it to the same device repeatedly. batch = self.transfer_batch_to_gpu(batch, gpu_id) args[0] = batch output = self.model.training_step(*args) # TPU support elif self.use_tpu: batch = self.transfer_batch_to_tpu(batch, self.tpu_id) args[0] = batch output = self.model.training_step(*args) # CPU forward else: output = self.model.training_step(*args) # allow any mode to define training_step_end # do something will all the dp outputs (like softmax) if self.is_overridden('training_step_end'): model_ref = self.get_model() with self.profiler.profile('training_step_end'): output = model_ref.training_step_end(output) # allow any mode to define training_end # TODO: remove in 1.0.0 if self.is_overridden('training_end'): model_ref = self.get_model() with self.profiler.profile('training_end'): output = model_ref.training_end(output) rank_zero_warn( '`training_end` was deprecated in 0.7.0 and will be removed 1.0.0.' ' Use training_epoch_end instead', DeprecationWarning) return output
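# transfer_batch_to_gpu above is a trainer utility defined elsewhere. A rough
# sketch of the behavior it needs here (an assumption, not the library's
# actual implementation): recursively move every tensor in a possibly nested
# batch onto the given device.
import torch

def transfer_batch_to_gpu(batch, device):
    if isinstance(batch, torch.Tensor):
        return batch.cuda(device)
    if isinstance(batch, (list, tuple)):
        return type(batch)(transfer_batch_to_gpu(b, device) for b in batch)
    if isinstance(batch, dict):
        return {k: transfer_batch_to_gpu(v, device) for k, v in batch.items()}
    return batch  # non-tensor leaves pass through unchanged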
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-dataPath", default='', type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", default='',
                        help="the seed neural network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-ali_dir",
        help="the directory to load trans_model and tree used for alignments")
    parser.add_argument("-lang_dir", help="the lexicon directory to load L.fst")
    parser.add_argument(
        "-chain_dir",
        help="the directory to load trans_model, tree and den.fst for chain model")
    parser.add_argument("-lr", type=float, help="set the base learning rate")
    parser.add_argument(
        "-warmup_steps",
        default=4000,
        type=int,
        help="the number of warmup steps to adjust the learning rate")
    parser.add_argument("-xent_regularize",
                        default=0,
                        type=float,
                        help="cross-entropy regularization weight")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-weight_decay",
                        default=1e-4,
                        type=float,
                        help="set the L2 regularization weight")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default:100)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument(
        "-anneal_lr_epoch",
        default=2,
        type=int,
        help="start to anneal the learning rate from this epoch")
    parser.add_argument("-anneal_lr_ratio",
                        default=0.5,
                        type=float,
                        help="the ratio by which to anneal the learning rate")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = \
        lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"],
                    model_config["hidden_size"], model_config["num_layers"],
                    model_config["dropout"], True)
    model.cuda()

    # setup the optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            header = k[:7]
            name = k[7:]  # remove 'module.' of dataparallel
            new_state_dict[name] = v
        if header == "module.":
            model.load_state_dict(new_state_dict)
        else:
            model.load_state_dict(state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))

    ali_model = args.ali_dir + "/final.mdl"
    ali_tree = args.ali_dir + "/tree"
    L_fst = args.lang_dir + "/L.fst"
    disambig = args.lang_dir + "/phones/disambig.int"

    den_fst = kaldi_fst.StdVectorFst.read(args.chain_dir + "/den.fst")

    chain_model_path = args.chain_dir + "/0.trans_mdl"
    chain_tree_path = args.chain_dir + "/tree"
    if os.path.isfile(chain_model_path):
        chain_trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(chain_model_path) as ki:
            chain_trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (chain_model_path))
        sys.exit(0)

    chain_tree = kaldi_tree.ContextDependency()
    with kaldi_util.io.xopen(chain_tree_path) as ki:
        chain_tree.read(ki.stream(), ki.binary)

    # chain supervision options
    supervision_opts = kaldi_chain.SupervisionOptions()
    supervision_opts.convert_to_pdfs = True
    supervision_opts.frame_subsampling_factor = 3
    supervision_opts.left_tolerance = 5
    supervision_opts.right_tolerance = 5

    # chain training options
    chain_opts = kaldi_chain.ChainTrainingOptions()
    chain_opts.leaky_hmm_coefficient = 1e-4
    chain_opts.xent_regularize = args.xent_regularize

    # setup the aligner
    aligner = kaldi_align.MappedAligner.from_files(ali_model,
                                                   ali_tree,
                                                   L_fst,
                                                   None,
                                                   disambig,
                                                   None,
                                                   beam=10,
                                                   transition_scale=1.0,
                                                   self_loop_scale=0.1,
                                                   acoustic_scale=0.1)
    den_graph = kaldi_chain.DenominatorGraph(den_fst,
                                             model_config["label_size"])

    # encoder_layer = nn.TransformerEncoderLayer(512, 8)
    # print(encoder_layer)

    model.train()
    for epoch in range(args.num_epochs):

        # anneal learning rate
        if epoch > args.anneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.anneal_lr_ratio

        run_train_epoch(model, optimizer, train_dataloader, epoch,
                        chain_trans_model, chain_tree, supervision_opts,
                        aligner, den_graph, chain_opts, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/chain.model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
def on_train_epoch_end(self, outputs):
    hvd.join(hvd.local_rank() if self.trainer.on_gpu else -1)
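hvd.join() marks a rank as finished while still servicing collectives from ranks that are still working, so workers with uneven numbers of batches can end an epoch without deadlocking; the argument pins join's internal operations to a GPU index, with -1 meaning CPU. A minimal standalone sketch, not taken from the scripts above:

import torch
import horovod.torch as hvd

hvd.init()
device = hvd.local_rank() if torch.cuda.is_available() else -1

# Each rank deliberately runs a different number of steps; join() keeps
# the early finishers participating until the last rank is done.
n_steps = 10 + hvd.rank()
for _ in range(n_steps):
    t = torch.ones(4)
    if device >= 0:
        t = t.cuda(device)
    hvd.allreduce_(t, average=True)
hvd.join(device)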
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path", default='', type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", help="the seed neural network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion", type=str,
                        choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training criterion")
    parser.add_argument("-trans_model",
                        help="the HMM transition model, used for lattice generation")
    parser.add_argument("-prior_path",
                        help="the prior for decoder, usually named final.occs in a kaldi setup")
    parser.add_argument("-den_dir",
                        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio", default=0.1, type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum", default=0, type=float,
                        help="set the momentum")
    parser.add_argument("-batch_size", default=32, type=int,
                        help="override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=0, type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=100, type=float,
                        help="process n hours of data per sweep (default: 100)")
    parser.add_argument("-num_epochs", default=1, type=int,
                        help="number of training epochs (default: 1)")
    parser.add_argument('-print_freq', default=10, type=int, metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq', default=1000, type=int, metavar='N',
                        help='save model frequency (default: 1000)')
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)
    config['data_path'] = args.data_path
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod and pin this process to its local GPU
    hvd.init()
    th.cuda.set_device(hvd.local_rank())
    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
        dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"],
                        model_config["label_size"],
                        model_config["hidden_size"],
                        model_config["num_layers"],
                        model_config["dropout"],
                        True)
    model.cuda()

    # set up the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        model.load_state_dict(state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n' %
                         (args.seed_model))
        sys.exit(1)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' %
                         (silence_phones))
        sys.exit(1)
    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    # now we can set up the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    # produce raw state-level lattices instead of compact lattices
    decoder_opts.determinize_lattice = False
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()
    for epoch in range(args.num_epochs):

        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model on rank 0 only
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
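Both scripts write rank-0 checkpoints containing 'model', 'optimizer', and 'epoch' keys but do not show how to resume from one. A minimal sketch of resuming under Horovod, assuming the same model/optimizer construction as above; resume_from_checkpoint and the example path are illustrative names, not part of the original recipes.

import torch as th
import horovod.torch as hvd

def resume_from_checkpoint(model, optimizer, path):
    # Only rank 0 reads the file; the broadcasts below propagate the
    # restored weights and optimizer state to every other worker.
    start_epoch = 0
    if hvd.rank() == 0:
        checkpoint = th.load(path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # broadcast the epoch counter so every rank resumes at the same point
    start_epoch = hvd.broadcast_object(start_epoch, root_rank=0)
    return start_epoch

# Example usage (illustrative path):
#   start_epoch = resume_from_checkpoint(model, optimizer,
#                                        args.exp_dir + '/model.se.0.tar')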