def all_gather_stats_list(stat_list, max_size=4096):
    """
    Gather a `Statistics` list across all processes/nodes

    Args:
        stat_list(list([`Statistics`])): list of statistics objects to
            gather across all processes/nodes
        max_size(int): max buffer size to use

    Returns:
        our_stats(list([`Statistics`])): list of updated stats
    """
    from torch.distributed import get_rank
    from onmt.utils.distributed import all_gather_list

    # Get a list of world_size lists with len(stat_list) Statistics objects
    all_stats = all_gather_list(stat_list, max_size=max_size)

    our_rank = get_rank()
    our_stats = all_stats[our_rank]
    for other_rank, stats in enumerate(all_stats):
        if other_rank == our_rank:
            continue
        for i, stat in enumerate(stats):
            our_stats[i].update(stat, update_n_src_words=True)
    return our_stats
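A minimal sketch of the same gather written against the built-in torch.distributed.all_gather_object (available in recent PyTorch releases) instead of OpenNMT's all_gather_list; it assumes the Statistics objects are picklable, and the update loop mirrors the function above.

import torch.distributed as dist

def all_gather_stats_list_builtin(stat_list):
    # One slot per rank; all_gather_object fills each slot with that rank's list.
    world_size = dist.get_world_size()
    all_stats = [None] * world_size
    dist.all_gather_object(all_stats, stat_list)

    our_rank = dist.get_rank()
    our_stats = all_stats[our_rank]
    for other_rank, stats in enumerate(all_stats):
        if other_rank == our_rank:
            continue
        for i, stat in enumerate(stats):
            our_stats[i].update(stat, update_n_src_words=True)
    return our_stats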
def _init_group_test(self):
    group = [1, 2]
    group_id = dist.new_group(group)
    rank = dist.get_rank()
    if rank not in group:
        return ([], None, rank)
    return (group, group_id, rank)
def __init__(self, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.extra = 0
def test_mpi():
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)
    dist.all_reduce(vector, op=dist.reduce_op.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
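The get_hostname() helper is called above but not shown; a minimal sketch (the name is taken from the call site, the body is an assumption):

import socket

def get_hostname():
    # Return this worker's hostname for labelling per-rank output.
    return socket.gethostname()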
def __init__(self, dataset, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
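A minimal sketch, assuming torch is imported, of the __iter__/set_epoch methods that usually accompany this constructor (mirroring torch.utils.data.DistributedSampler): shuffle with an epoch-seeded generator, pad the indices to total_size so every replica gets the same count, then take every num_replicas-th index starting at this rank.

def __iter__(self):
    # Deterministic shuffle that changes every epoch but agrees across ranks.
    g = torch.Generator()
    g.manual_seed(self.epoch)
    indices = torch.randperm(len(self.dataset), generator=g).tolist()

    # Pad so the list divides evenly, then take this rank's strided slice.
    indices += indices[:(self.total_size - len(indices))]
    indices = indices[self.rank:self.total_size:self.num_replicas]
    assert len(indices) == self.num_samples
    return iter(indices)

def __len__(self):
    return self.num_samples

def set_epoch(self, epoch):
    self.epoch = epoch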
def test_get_rank(self):
    test_dir = os.path.join(TEMP_DIR, 'test_dir')
    pid = str(os.getpid())
    num_processes = dist.get_world_size()
    with open(os.path.join(test_dir, pid), 'w') as f:
        f.write(str(dist.get_rank()))

    self._barrier()

    all_ranks = set()
    for f_name in os.listdir(test_dir):
        with open(os.path.join(test_dir, f_name), 'r') as f:
            all_ranks.add(int(f.read()))
    self.assertEqual(len(all_ranks), num_processes)

    self._barrier()

    if dist.get_rank() == 0:
        for f_name in os.listdir(test_dir):
            os.unlink(os.path.join(test_dir, f_name))

    self._barrier()
def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
    """
    Samples batches assuming they are in order of size to batch
    similarly sized samples together.
    """
    super(DistributedBucketingSampler, self).__init__(data_source)
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.data_source = data_source
    self.ids = list(range(0, len(data_source)))
    self.batch_size = batch_size
    self.bins = [self.ids[i:i + batch_size]
                 for i in range(0, len(self.ids), batch_size)]
    self.num_replicas = num_replicas
    self.rank = rank
    self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
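A minimal sketch (an assumption consistent with the constructor above) of the __iter__ that usually goes with this bucketing sampler: pad the list of bins to total_size so every replica sees the same number of batches, then hand each rank every num_replicas-th bin.

def __iter__(self):
    # Pad with bins from the front, then take this rank's strided slice of bins.
    bins = self.bins + self.bins[:(self.total_size - len(self.bins))]
    bins = bins[self.rank:self.total_size:self.num_replicas]
    assert len(bins) == self.num_samples
    return iter(bins)

def __len__(self):
    return self.num_samples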
def test_send_recv(self):
    rank = dist.get_rank()
    tensor = _build_tensor(rank + 1)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(src + 1, value=-1)
        expected_tensor = _build_tensor(src + 1)
        dist.recv(tensor, src)
        self.assertEqual(tensor, expected_tensor)

    self._barrier()
def test_isend(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        requests = [
            dist.isend(_build_tensor(dest, 10), dest)
            for dest in range(1, world_size)
        ]
        for request in requests:
            request.wait()
            self.assertTrue(request.is_completed())
    else:
        tensor = _build_tensor(rank, -1)
        dist.recv(tensor, 0)
        self.assertEqual(tensor, _build_tensor(rank, 10))

    self._barrier()
def config_pytorch(options):
    """Configure PyTorch.

    Fix random seeds, initialize the distributed environment, and set up
    the CUDA environment for PyTorch.

    :param options: A global object containing specified options.
    :type options: argparse.Namespace
    """
    # Setting `cudnn.deterministic = True` turns on the CUDNN deterministic
    # setting, which can slow down training considerably. Unexpected behavior
    # may also be observed when restarting from a checkpoint.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if options.cudnn_deterministic:
        cudnn.deterministic = True
        log.warning('You have chosen to seed training. '
                    'This will turn on the CUDNN deterministic setting, '
                    'which can slow down your training considerably! '
                    'You may see unexpected behavior when restarting '
                    'from checkpoints.', 0)

    if options.seed is not None:
        random.seed(options.seed)
        torch.manual_seed(options.seed)

    # define the graph for the computation.
    if options.use_cuda:
        assert torch.cuda.is_available()

    options.rank = dist.get_rank()
    options.world_size = dist.get_world_size()
    options.graph = FCGraph(options)

    # enable cudnn accelerator if we are using cuda.
    if options.use_cuda:
        options.graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        if torch.backends.cudnn.version() is None:
            log.warning("CUDNN not found on device.")

    log.info("World size={}, Rank={}, hostname={}, cuda_available={}, cuda_device={}".format(
        options.world_size, options.rank, socket.gethostname(),
        torch.cuda.is_available(), torch.cuda.current_device()))
def test_send_recv_any_source(self):
    rank = dist.get_rank()
    tensor = _build_tensor(10, rank)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    recv_ranks = set()
    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(10, value=-1)
        dist.recv(tensor)
        recv_ranks.add(tensor.resize_(1)[0])

    self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
    self._barrier()
def test_irecv(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
        requests = [
            dist.irecv(expected_tensors[src - 1], src)
            for src in range(1, world_size)
        ]
        for src in range(1, world_size):
            requests[src - 1].wait()
            self.assertTrue(requests[src - 1].is_completed())
            self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
    else:
        tensor = _build_tensor(rank, 10)
        dist.send(tensor, 0)

    self._barrier()
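A stand-alone sketch of the same non-blocking point-to-point pattern outside the test harness: rank 0 posts an isend, rank 1 posts the matching irecv, and both wait on the returned work handle before touching the tensor (the ping name and payload are illustrative).

import torch
import torch.distributed as dist

def ping(value=42.0):
    rank = dist.get_rank()
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += value
        req = dist.isend(tensor, dst=1)
    elif rank == 1:
        req = dist.irecv(tensor, src=0)
    else:
        return None
    req.wait()  # the tensor is only safe to read/reuse after wait()
    return tensor.item()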
def filter(self, record):
    record.rank = dist.get_rank()
    return True
def warning(content, who='all'):
    if who == 'all' or who == dist.get_rank():
        logger.warning("{}".format(content))
def info(content, who='all'):
    if who == 'all' or who == dist.get_rank():
        logger.info(content)
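A small usage sketch showing how the rank filter and the who-based helpers above are typically wired together (the logger name and format string are assumptions, and the process group is assumed to be initialized already): the filter stamps every record with the caller's rank, and who restricts a message to a single rank.

import logging
import torch.distributed as dist

class RankFilter(logging.Filter):
    def filter(self, record):
        record.rank = dist.get_rank()
        return True

logger = logging.getLogger("dist")
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("[rank %(rank)d] %(levelname)s: %(message)s"))
logger.addHandler(handler)
logger.addFilter(RankFilter())
logger.setLevel(logging.INFO)

# info("loaded dataset")            # printed by every rank
# info("saving checkpoint", who=0)  # printed by rank 0 only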
def train(args): world_size = len(args.hosts) is_distributed = world_size > 1 logger.debug('Number of hosts {}. Distributed training - {}'.format(world_size, is_distributed)) use_cuda = args.num_gpus > 0 logger.debug('Number of gpus available - {}'.format(args.num_gpus)) kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} device = torch.device('cuda' if use_cuda else 'cpu') if is_distributed: # Initialize the distributed environment. backend = 'gloo' os.environ['WORLD_SIZE'] = str(world_size) host_rank = args.hosts.index(args.current_host) dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size) logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format( dist.get_rank(), torch.cuda.is_available(), args.num_gpus)) # set the seed for generating random numbers seed = 1 torch.manual_seed(seed) if use_cuda: torch.cuda.manual_seed(seed) train_sampler, train_loader = _get_train_data_loader(args.data_dir, is_distributed, args.batch_size, **kwargs) test_loader = _get_test_data_loader(args.data_dir, **kwargs) logger.debug('Processes {}/{} ({:.0f}%) of train data'.format( len(train_loader.sampler), len(train_loader.dataset), 100. * len(train_loader.sampler) / len(train_loader.dataset) )) logger.debug('Processes {}/{} ({:.0f}%) of test data'.format( len(test_loader.sampler), len(test_loader.dataset), 100. * len(test_loader.sampler) / len(test_loader.dataset) )) model = Net().to(device) if is_distributed and use_cuda: # multi-machine multi-gpu case logger.debug('Multi-machine multi-gpu: using DistributedDataParallel.') model = torch.nn.parallel.DistributedDataParallel(model) elif use_cuda: # single-machine multi-gpu case logger.debug('Single-machine multi-gpu: using DataParallel().cuda().') model = torch.nn.DataParallel(model) else: # single-machine or multi-machine cpu case logger.debug('Single-machine/multi-machine cpu: using DataParallel.') model = torch.nn.DataParallel(model) optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.5) log_interval = 100 for epoch in range(1, args.epochs + 1): if is_distributed: train_sampler.set_epoch(epoch) model.train() for batch_idx, (data, target) in enumerate(train_loader, 1): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() if is_distributed and not use_cuda: # average gradients manually for multi-machine cpu case only _average_gradients(model) optimizer.step() if batch_idx % log_interval == 0: logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.sampler), 100. * batch_idx / len(train_loader), loss.item())) accuracy = test(model, test_loader, device) save_model(model, args.model_dir) logger.debug('Overall test accuracy: {}'.format(accuracy))
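_average_gradients(model) is called in the multi-machine CPU branch above but is not shown; a common sketch of such a helper (the body is an assumption): all-reduce every gradient and divide by the world size so each worker applies the averaged gradient.

import torch.distributed as dist

def _average_gradients(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is None:
            continue
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size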
def train(train_loader, r, optimizer, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    n = r.num_iterations(loader_size=len(train_loader))
    if args.num_minibatches is not None:
        n = min(n, args.num_minibatches)
    r.train(n)
    if not is_first_stage():
        train_loader = None
    r.set_loader(train_loader)

    end = time.time()
    epoch_start_time = time.time()

    if args.no_input_pipelining:
        num_warmup_minibatches = 0
    else:
        num_warmup_minibatches = r.num_warmup_minibatches

    if args.verbose_frequency > 0:
        logging.info("Letting in %d warm-up minibatches" % num_warmup_minibatches)
        logging.info("Running training for %d minibatches" % n)

    # start num_warmup_minibatches forward passes
    for i in range(num_warmup_minibatches):
        r.run_forward()

    for i in range(n - num_warmup_minibatches):
        # perform forward pass
        r.run_forward()

        # Adjust learning rate
        adjust_learning_rate(optimizer, epoch, args.epochs, r, args.lr_policy, i, n)

        if is_last_stage():
            # measure accuracy and record loss
            output, target, loss = r.output, r.target, r.loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), output.size(0))
            top1.update(prec1[0], output.size(0))
            top5.update(prec5[0], output.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            epoch_time = (end - epoch_start_time) / 3600.0
            full_epoch_time = (epoch_time / float(i + 1)) * float(n)

            if i % args.print_freq == 0:
                logging.info(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Epoch time [hr]: {epoch_time:.3f} ({full_epoch_time:.3f})\t'
                    'Memory: {memory:.3f}G ({cached_memory:.3f}G)\t'
                    'Loss: {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1: {top1.val:.2f}% ({top1.avg:.2f}%)\t'
                    'Prec@5: {top5.val:.2f}% ({top5.avg:.2f}%)'.format(
                        epoch, i, n,
                        batch_time=batch_time,
                        epoch_time=epoch_time,
                        full_epoch_time=full_epoch_time,
                        loss=losses,
                        top1=top1,
                        top5=top5,
                        memory=(float(torch.cuda.memory_allocated()) / 10**9),
                        cached_memory=(float(torch.cuda.memory_cached()) / 10**9)))
                import sys
                sys.stdout.flush()
                # print(losses.avg, i)
                # print(top1.avg, i)
        else:
            if i % args.print_freq == 0:
                logging.info(
                    'Epoch: [{0}][{1}/{2}]\tMemory: {memory:.3f}G ({cached_memory:.3f}G)'
                    .format(epoch, i, n,
                            memory=(float(torch.cuda.memory_allocated()) / 10**9),
                            cached_memory=(float(torch.cuda.memory_cached()) / 10**9)))
                import sys
                sys.stdout.flush()

        # perform backward pass
        if args.fp16:
            r.zero_grad()
        else:
            optimizer.zero_grad()

        # consistent
        # optimizer.load_old_params()
        r.run_backward()
        # optimizer.load_new_params()
        # s = optimizer.get_s()
        if args.square:
            s = (dist.get_world_size() - dist.get_rank())**2
        else:
            s = None
        # logging.warning(f'outside: {dist.get_rank()}: {args.find_median}, s = {s}')
        if args.spectrain:
            optimizer.step(s=s, find_median=args.find_median)
        else:
            optimizer.step()

        # inconsistent
        # optimizer.load_old_params()
        # r.run_backward()
        # optimizer.load_new_params()
        # optimizer.step()

    global writer
    if dist.get_rank() == dist.get_world_size() - 1:
        writer.add_scalar('Train/Loss', losses.avg, epoch)
        writer.add_scalar('Train/Accuracy', top1.avg, epoch)

    # finish remaining backward passes
    for i in range(num_warmup_minibatches):
        optimizer.zero_grad()
        # optimizer.load_old_params()
        r.run_backward()
        # optimizer.load_new_params()
        if args.spectrain:
            optimizer.step(s=s, find_median=args.find_median)
        else:
            optimizer.step()

    # wait for all helper threads to complete
    r.wait()

    logging.info("Epoch %d: %.3f seconds" % (epoch, time.time() - epoch_start_time))
    logging.info("Epoch start time: %.3f, epoch end time: %.3f" %
                 (epoch_start_time, time.time()))
hidden_dim = 128
input_steps = segment_size
output_steps = segment_size
input_size = 1
output_size = 1

train_idx = list(range(training_size))
valid_idx = list(range(training_size, train_valid_size))
test_idx = list(range(train_valid_size, train_valid_size + test_size))

encoder = Encoder(input_size, hidden_dim, num_layers, dropout_rate)
decoder = Decoder(output_size, hidden_dim, num_layers, dropout_rate)

# to enable multi GPU training
# NOTE: the process group must be initialized before dist.get_rank() /
# dist.get_world_size() can be called; rank and world size are taken here from
# the RANK/WORLD_SIZE environment variables set by the launcher (e.g. torchrun).
dist.init_process_group("gloo",
                        rank=int(os.environ["RANK"]),
                        world_size=int(os.environ["WORLD_SIZE"]))
rank = dist.get_rank()
world_size = dist.get_world_size()
print('rank {} and world size {}'.format(rank, world_size))

model = Seq2Seq(encoder, decoder, rank).to(rank)
model = DDP(model, device_ids=[rank])
model, loss, preds, min_valid_loss, test_rmse = train_model(
    model, X, Y, learning_rate,
    output_steps=output_steps,
    batch_size=64,
def train(train_loader, model, criterion, optimizer, epoch, log_writer): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() global iter_ptr train_record.set() # switch to train mode model.train() end = time.time() torch.cuda.synchronize() i = -1 #while input is not None: for input, target in train_loader: assert input.size(0) == target.size(0) i += 1 iter_ptr += 1 if args.prof and (i > 200): break # measure data loading time data_time.update(time.time() - end) input = input.cuda(async=True) target = target.cuda(async=True) input_var = Variable(input) target_var = Variable(target) # compute output output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) if args.distributed: reduced_loss = reduce_tensor(loss.data) #reduced_loss = loss.data prec1 = reduce_tensor(prec1) prec5 = reduce_tensor(prec5) else: reduced_loss = loss.data losses.update(to_python_float(reduced_loss), input.size(0)) top1.update(to_python_float(prec1), input.size(0)) top5.update(to_python_float(prec5), input.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() #input, target = prefetcher.next() if dist.get_rank() == 0 and i % args.print_freq == 0 and i > 1: train_record.record() print('Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t' 'Total Training Time {train_time:.3f}'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5, train_time=train_record.get_time())) train_record.set() if log_writer: log_writer.add_scalar('train_iter/top1', top1.get_avg(), iter_ptr) log_writer.add_scalar('train_iter/top5', top5.get_avg(), iter_ptr) log_writer.add_scalar('train_iter/loss', losses.get_avg(), iter_ptr) log_writer.add_scalar('train_iter/batch_time', batch_time.get_avg(), iter_ptr) log_writer.add_scalar('train_iter/data_time', data_time.get_avg(), iter_ptr) log_writer.add_scalar('train_iter/learning_rate_schedule', args.lr_present, iter_ptr) log_writer.add_scalar('train_epoch/top1', top1.get_avg(), epoch) log_writer.add_scalar('train_epoch/top5', top5.get_avg(), epoch) log_writer.add_scalar('train_epoch/loss', losses.get_avg(), epoch) log_writer.add_scalar('train_epoch/learning_rate_schedule', args.lr_present, epoch) log_writer.add_scalar('train_time/top1', top1.get_avg(), train_record.get_time()) log_writer.add_scalar('train_time/top5', top5.get_avg(), train_record.get_time()) log_writer.add_scalar('train_time/loss', losses.get_avg(), train_record.get_time()) if args.larc_enable: #add larc_adaptive_lr saving laryer_saving_name = ['layer0.conv1.weight', 'layer0.bn1.weight', 'layer1.1.conv1.weight', \ 'layer2.1.conv1.weight', 'layer3.1.conv1.weight', 'layer4.1.conv1.weight'] #correspond to list laryer_saving in Signum_SGD.py for index, layer_lr in enumerate(optimizer.layer_adaptive_lr): log_writer.add_scalar('larc_layer_adaptive_lr/' + laryer_saving_name[index], layer_lr, epoch)
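reduce_tensor is used above but not shown; a common sketch of that helper (an assumption about its body): all-reduce a copy of the metric tensor and divide by the world size so every rank logs the same averaged value.

import torch.distributed as dist

def reduce_tensor(tensor):
    rt = tensor.clone()  # keep the caller's tensor untouched
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt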
def forward(self, input, input_mask=None, attention_mask=None, head_mask=None, layer_past=None, get_key_value=False, get_present=False, encoder_output=None, enc_dec_attn_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, use_cache=False, output_attentions=False): get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask input_type = input.dtype if (self.config.fp16 or self.config.q_int8) \ and input.dtype == torch.float: input = input.half() with torch.no_grad(): attention_output = self.attention(input, input_mask, head_mask, layer_past, get_present, encoder_hidden_states, encoder_attention_mask, output_attentions, self.norm_w, self.norm_b) if get_present: attention_output, p_key, p_value = attention_output[0:3] presents = (p_key, p_value) elif output_attentions: attention_output, _, _, context_output = attention_output[0:4] else: attention_output = attention_output[0] residual_add = attention_output + self.attention.attn_ob attention_output = self.ds_layernorm(residual_add, self.attn_nw, self.attn_nb, self.config.epsilon) if self.config.mlp_type == 'residual': res_mlp_out = self.res_mlp(attention_output, async_op=True) res_coef_out = self.res_coef_func(attention_output, async_op=True) if self.expert_mp_group is not None: tensor_list = [ torch.empty_like(attention_output) for _ in range( dist.get_world_size(group=self.expert_mp_group)) ] tensor_list[dist.get_rank( group=self.expert_mp_group)] = attention_output dist.all_gather(tensor_list, attention_output, group=self.expert_mp_group) attention_output = torch.cat(tensor_list).contiguous() ############## MoE Gating + Experts ############### dispatched_attention, combined_weights = self.moe_gate_einsum( attention_output) dispatched_input = self._alltoall(dispatched_attention) expert_outputs = self.expert_exec(dispatched_input) expert_output = self._alltoall(expert_outputs) output = self.scale_expert_output(attention_output, expert_output, combined_weights) ################################################ if self.expert_mp_group is not None: output = output.split( output.shape[0] // dist.get_world_size(group=self.expert_mp_group), dim=0)[dist.get_rank(group=self.expert_mp_group)] if self.config.mlp_type == 'residual': inference_cuda_module.moe_res_matmul(res_mlp_out, res_coef_out, output) output = self.bias_residual_func(output, residual_add, torch.empty(1)) if not self.config.pre_layer_norm: output = self.ds_layernorm(output, self.norm_w, self.norm_b, self.config.epsilon) if input_type != output.dtype: output = output.to(input_type) if get_present: output = (output, presents) if self.config.return_tuple: return output if type(output) is tuple else (output, ) else: return output
def main(run_id, validation_only=False): r"""Main logic.""" num_parallel_workers = 2 dataset_root = '/datasets/torch/cifar10' ckpt_run_dir = '/checkpoints/decentralized/cifar_resnet20' use_cuda = True train_epochs = 164 initialize_backends( comm_backend='mpi', logging_level='INFO', logging_file='/mlbench.log', use_cuda=use_cuda, seed=42, cudnn_deterministic=False, ckpt_run_dir=ckpt_run_dir, delete_existing_ckpts=not validation_only) rank = dist.get_rank() world_size = dist.get_world_size() batch_size = 256 // world_size model = ResNetCIFAR( resnet_size=20, bottleneck=False, num_classes=10, version=1) optimizer = optim.SGD( model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True) # Create a learning rate scheduler for an optimizer scheduler = MultiStepLR( optimizer, milestones=[82, 109], gamma=0.1) # A loss_function for computing the loss loss_function = CrossEntropyLoss() if use_cuda: model = model.cuda() loss_function = loss_function.cuda() # Metrics like Top 1/5 Accuracy metrics = [ TopKAccuracy(topk=1), TopKAccuracy(topk=5) ] train_set = CIFAR10V1(dataset_root, train=True, download=True) val_set = CIFAR10V1(dataset_root, train=False, download=True) train_set = partition_dataset_by_rank(train_set, rank, world_size) val_set = partition_dataset_by_rank(val_set, rank, world_size) train_loader = DataLoader( train_set, batch_size=batch_size, shuffle=True, num_workers=num_parallel_workers, pin_memory=use_cuda, drop_last=False) val_loader = DataLoader( val_set, batch_size=batch_size, shuffle=False, num_workers=num_parallel_workers, pin_memory=use_cuda, drop_last=False) checkpointer = Checkpointer( ckpt_run_dir=ckpt_run_dir, rank=rank, checkpoint_all=True) if not validation_only: # Aggregation ring_neighbors = [(rank + 1) % world_size, (rank - 1) % world_size] agg_fn = DecentralizedAggregation( rank=rank, neighbors=ring_neighbors).agg_model controlflow = TrainValidation( model=model, optimizer=optimizer, loss_function=loss_function, metrics=metrics, scheduler=scheduler, batch_size=batch_size, train_epochs=train_epochs, rank=rank, world_size=world_size, run_id=run_id, dtype='fp32', validate=True, schedule_per='epoch', checkpoint=checkpointer, transform_target_type=None, average_models=True, use_cuda=use_cuda, max_batch_per_epoch=None, agg_fn=agg_fn) controlflow.run( dataloader_train=train_loader, dataloader_val=val_loader, dataloader_train_fn=None, dataloader_val_fn=None, resume=False, repartition_per_epoch=False) else: cecf = CheckpointsEvaluationControlFlow( ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size, checkpointer=checkpointer, model=model, epochs=train_epochs, loss_function=loss_function, metrics=metrics, use_cuda=use_cuda, dtype='fp32', max_batch_per_epoch=None) train_stats = cecf.evaluate_by_epochs(train_loader) with open(os.path.join(ckpt_run_dir, "train_stats.json"), 'w') as f: json.dump(train_stats, f) val_stats = cecf.evaluate_by_epochs(val_loader) with open(os.path.join(ckpt_run_dir, "val_stats.json"), 'w') as f: json.dump(val_stats, f)
def train(verbose=True, **kwargs):
    args = kwargs['args']
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()

    ## dataset
    ds = CityScapes(cfg, mode='train_val')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=cfg.ims_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=cfg.n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    net = EaNet(cfg)
    net.cuda()
    it_start = 0
    n_epoch = 0

    ## optimizer
    optim = Optimizer(
        net,
        cfg.lr_start,
        cfg.momentum,
        cfg.weight_decay,
        cfg.warmup_steps,
        cfg.warmup_start_lr,
        cfg.max_iter,
        cfg.lr_power,
        # start_iter = it_start
    )

    ## resume
    if cfg.resume:
        print("=> loading checkpoint '{}'".format(cfg.resume))
        checkpoint = torch.load(cfg.resume)
        if '.tar' in cfg.resume:
            net.load_state_dict(checkpoint['model'])
            optim.optim.load_state_dict(checkpoint['optimizer'])
            # it_start = checkpoint['it']
            n_epoch = checkpoint['epoch']
            bestMIOU = checkpoint['mIOU']
            # optim.it = it_start
            print('Pth.Tar Load model from {}'.format(cfg.resume))
        else:
            net.load_state_dict(checkpoint)
            print('Pth Load model from {}'.format(cfg.resume))
        print('pretrained model loaded')
        net.eval()
        evaluator = MscEval(cfg)
        mIOU = evaluator(net)
        print('mIOU start from %f' % mIOU)
        del checkpoint
        net.train()

    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank, ],
                                              output_device=args.local_rank)
    n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    # criteria = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()
    criteria = ECELoss(thresh=cfg.ohem_thresh,
                       n_min=n_min,
                       n_classes=cfg.n_classes,
                       alpha=cfg.alpha,
                       radius=cfg.radius,
                       beta=cfg.beta,
                       ignore_lb=cfg.ignore_label,
                       mode=cfg.mode).cuda()

    ## train loop
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    # n_epoch = 0
    counter = 0
    # number of epochs already finished
    epochF = 0
    bestMIOU = 0
    for it in range(it_start, cfg.max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == cfg.ims_per_gpu:
                continue
        except StopIteration:
            n_epoch += 1
            sampler.set_epoch(n_epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        try:
            optim.zero_grad()
            logits = net(im)
            loss = criteria(logits, lb)
            loss.backward()
            optim.step()
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory')
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                raise e
        '''
        logits = net(im)
        loss = criteria(logits, lb)
        loss = loss / (cfg.ims_per_gpu)
        counter += 1
        loss.backward()
        if counter == cfg.ims_per_gpu:
            optim.step()
            optim.zero_grad()
            counter = 0
        '''
        loss_avg.append(loss.item())

        ## print training log message
        if it % cfg.msg_iter == 0 and not it == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((cfg.max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'iter: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it, max_it=cfg.max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

        # evaluate periodically
        if n_epoch > epochF and n_epoch > 20:
            # mark this epoch as evaluated
            epochF = n_epoch
            # if (n_epoch > 35) and it % (5 * cfg.msg_iter) == 0 and not it == 0:
            #     net.cpu()
            #     save_pth = osp.join(cfg.respth, 'model_final_best.pth')
            #     state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            #     if dist.get_rank() == 0:
            #         torch.save(state, save_pth)
            #     logger.info('training done, model saved to: {}'.format(save_pth))
            #     logger.info('evaluating the final model')
            #     net.cuda()
            net.eval()
            evaluator = MscEval(cfg)
            mIOU = evaluator(net)
            logger.info('mIOU is: {}'.format(mIOU))

            # save a checkpoint
            save_pth = osp.join(cfg.respth, 'checkpoint.pth.tar')
            state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            if dist.get_rank() == 0:
                stateF = {
                    'model': state,
                    'lr': optim.lr,
                    'mIOU': mIOU,
                    'it': it,
                    'epoch': n_epoch,
                    'optimizer': optim.optim.state_dict(),
                }
                torch.save(stateF, save_pth)

            if mIOU > bestMIOU:
                logger.info('Got a new best mIOU: {} at epoch: {}'.format(mIOU, n_epoch))
                # print('Got a new best mIOU: {}'.format(bestMIOU))
                bestMIOU = mIOU
                # net.cpu()
                save_pth = osp.join(cfg.respth, 'model_final_{}.pth'.format(n_epoch))
                state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
                if dist.get_rank() == 0:
                    torch.save(state, save_pth)
                # reload the model onto cuda
                # net.cuda()
            net.train()

    if verbose:
        net.cpu()
        save_pth = osp.join(cfg.respth, 'model_final.pth.tar')
        state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
        stateF = {
            'model': state,
            'lr': optim.lr,
            'mIOU': mIOU,
            'it': it,
            'epoch': n_epoch,
            'optimizer': optim.optim.state_dict(),
        }
        torch.save(stateF, save_pth)
        # if dist.get_rank() == 0: torch.save(state, save_pth)
        logger.info('training done, model saved to: {}'.format(save_pth))
        logger.info('evaluating the final model')
        net.cuda()
        net.eval()
        evaluator = MscEval(cfg)
        mIOU = evaluator(net)
        logger.info('mIOU is: {}'.format(mIOU))
def get_rank():
    try:
        return dist.get_rank()
    except:
        return None
def train_epoch(self, data_loader): self.model.train() num_ckpt = int(np.ceil(len(data_loader) / 10)) meter_loss = tnt.meter.MovingAverageValueMeter( len(data_loader) // 100 + 1) #meter_accuracy = tnt.meter.ClassErrorMeter(accuracy=True) #meter_confusion = tnt.meter.ConfusionMeter(p.NUM_CTC_LABELS, normalized=True) if self.lr_scheduler is not None: self.lr_scheduler.step() logger.debug(f"current lr = {self.lr_scheduler.get_lr()}") if is_distributed() and data_loader.sampler is not None: data_loader.sampler.set_epoch(self.epoch) # count the number of supervised batches seen in this epoch t = tqdm(enumerate(data_loader), total=len(data_loader), desc="training") for i, (data) in t: loss_value = self.unit_train(data) meter_loss.add(loss_value) t.set_description(f"training (loss: {meter_loss.value()[0]:.3f})") t.refresh() #self.meter_accuracy.add(ys_int, ys) #self.meter_confusion.add(ys_int, ys) if 0 < i < len(data_loader) and i % num_ckpt == 0: if not is_distributed() or (is_distributed() and dist.get_rank() == 0): title = "train" x = self.epoch + i / len(data_loader) if logger.visdom is not None: logger.visdom.add_point(title=title, x=x, y=meter_loss.value()[0]) if logger.tensorboard is not None: logger.tensorboard.add_graph(self.model, xs) xs_img = tvu.make_grid(xs[0, 0], normalize=True, scale_each=True) logger.tensorboard.add_image('xs', x, xs_img) ys_hat_img = tvu.make_grid(ys_hat[0].transpose(0, 1), normalize=True, scale_each=True) logger.tensorboard.add_image('ys_hat', x, ys_hat_img) logger.tensorboard.add_scalars( title, x, { 'loss': meter_loss.value()[0], }) if self.checkpoint: logger.info( f"training loss at epoch_{self.epoch:03d}_ckpt_{i:07d}: " f"{meter_loss.value()[0]:5.3f}") if not is_distributed() or (is_distributed() and dist.get_rank() == 0): self.save( self.__get_model_name( f"epoch_{self.epoch:03d}_ckpt_{i:07d}")) #input("press key to continue") self.epoch += 1 logger.info(f"epoch {self.epoch:03d}: " f"training loss {meter_loss.value()[0]:5.3f} ") #f"training accuracy {meter_accuracy.value()[0]:6.3f}") if not is_distributed() or (is_distributed() and dist.get_rank() == 0): self.save(self.__get_model_name(f"epoch_{self.epoch:03d}")) self.__remove_ckpt_files(self.epoch - 1)
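is_distributed() is referenced above but not defined here; a minimal sketch (an assumption) consistent with how it is used alongside dist.get_rank():

import torch.distributed as dist

def is_distributed():
    return dist.is_available() and dist.is_initialized()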
def main(): args = parse_args() # Devices if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True logger.info( f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}" ) # Load config config = BertConfig.from_json_file(args.config_file) # Load task config with open(args.tasks_config_file, "r") as f: task_cfg = edict(yaml.safe_load(f)) task_id = args.task.strip() task = "TASK" + task_id task_name = task_cfg[task]["name"] if task_cfg[task].get("fusion_method", None): # VL-BERT pooling for VQA config.fusion_method = task_cfg[task]["fusion_method"] # Output dirs timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name savePath = os.path.join(args.output_dir, timeStamp) if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) # Seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # Dataset batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval( args, config, task_cfg, args.task) # Logging tb_logger = tbLogger(timeStamp, savePath, [task_name], [task], task2num_iters, 1, save_logger=False, txt_name="eval.txt") # Model model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config, task_cfg=task_cfg, task_ids=[task]) # Optimization details criterion = LoadLoss(task_cfg, args.task) # Move to GPU(s) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) # Print summary if default_gpu: print("***** Running evaluation *****") print(" Num Iters: ", task2num_iters[task]) print(" Batch size: ", batch_size) # Evaluate model.eval() results = [] others = [] for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]): loss, score, batch_size, results, others = EvaluatingModel( config, task_cfg, device, task, batch, model, dl_val, criterion, results, others) tb_logger.step_val(0, float(loss), float(score), task, batch_size, "val") sys.stdout.write("%d/%d\r" % (i, len(dl_val))) sys.stdout.flush() # save the result or evaluate the result. ave_score = tb_logger.showLossVal(task) if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
def load_model(args): # Prepare GLUE task args.task_name = args.task_name.lower() args.output_mode = "classification" label_list = ["0", "1"] num_labels = len(label_list) # store args if args.local_rank != -1: args.world_size = torch.distributed.get_world_size() args.rank = dist.get_rank() # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: # Make sure only the first process in distributed training will # download model & vocab torch.distributed.barrier() args.train_model_type = args.train_model_type.lower() configObj = MSMarcoConfigDict[args.train_model_type] if 'fairseq' not in args.train_model_type: config = configObj.config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = configObj.tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = configObj.model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) elif 'fast' in args.train_model_type: config = configObj.config_class.from_pretrained( 'roberta-base', num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None, ) #print('???',args.model_name_or_path) model = configObj.model_class(config) #print('???',model.state_dict()['encoder.layers.1.fc2.weight']) #print('???',model.state_dict().keys()) if os.path.isdir(args.model_name_or_path): model.from_pretrained( os.path.join(args.model_name_or_path, args.model_file)) else: model.from_pretrained(os.path.join(args.model_name_or_path)) #print('???',model.state_dict()['encoder.layers.1.fc2.weight']) tokenizer = BertWordPieceTokenizer(args.bpe_vocab_file, clean_text=False, strip_accents=False, lowercase=False) else: config = configObj.config_class.from_pretrained( 'roberta-base', num_labels=args.num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None, ) model = configObj.model_class(config) model.from_pretrained( os.path.join(args.model_name_or_path, args.model_file)) tokenizer = torch.hub.load('pytorch/fairseq', 'roberta.base') if args.local_rank == 0: # Make sure only the first process in distributed training will # download model & vocab torch.distributed.barrier() model.to(args.device) return tokenizer, model
def _init_global_test(self):
    group = [i for i in range(0, dist.get_world_size())]
    group_id = dist.group.WORLD
    rank = dist.get_rank()
    return (group, group_id, rank)
def check_write_log():
    return dist.get_rank() == 0 or not use_multigpu_with_single_device_per_process
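A small usage sketch for the guard above (the maybe_save name and checkpoint path are hypothetical): only the process allowed to write touches the filesystem, so several ranks do not clobber the same file.

import torch

def maybe_save(model, epoch, path="checkpoint.pt"):
    if check_write_log():
        torch.save({"epoch": epoch, "state_dict": model.state_dict()}, path)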
def _run_train(device: torch.device, is_distributed: bool): serialize_dir = os.path.join(ROOT_PATH, "data/easytext/tests/trainer/save_and_load") if is_distributed: if TorchDist.get_rank() == 0: if os.path.isdir(serialize_dir): shutil.rmtree(serialize_dir) os.makedirs(serialize_dir) TorchDist.barrier() else: if os.path.isdir(serialize_dir): shutil.rmtree(serialize_dir) os.makedirs(serialize_dir) model = ModelDemo() optimizer_factory = _DemoOptimizerFactory() loss = _DemoLoss() metric = _DemoMetric() tensorboard_log_dir = "data/tensorboard" tensorboard_log_dir = os.path.join(ROOT_PATH, tensorboard_log_dir) # shutil.rmtree(tensorboard_log_dir) trainer = Trainer(num_epoch=100, model=model, loss=loss, metrics=metric, optimizer_factory=optimizer_factory, serialize_dir=serialize_dir, patient=20, num_check_point_keep=25, device=device, trainer_callback=None, is_distributed=is_distributed ) logging.info(f"test is_distributed: {is_distributed}") # trainer_callback = BasicTrainerCallbackComposite(tensorboard_log_dir=tensorboard_log_dir) train_dataset = _DemoDataset() if is_distributed: sampler = DistributedSampler(dataset=train_dataset) else: sampler = None train_data_loader = DataLoader(dataset=train_dataset, collate_fn=_DemoCollate(), batch_size=200, num_workers=0, sampler=sampler) if is_distributed: sampler = DistributedSampler(dataset=train_dataset) else: sampler = None validation_data_loader = DataLoader(dataset=train_dataset, collate_fn=_DemoCollate(), batch_size=200, num_workers=0, sampler=sampler) trainer.train(train_data_loader=train_data_loader, validation_data_loader=validation_data_loader) expect_model_state_dict = json.loads(json2str(trainer.model.state_dict())) expect_optimizer_state_dict = json.loads(json2str(trainer.optimizer.state_dict())) expect_current_epoch = trainer.current_epoch expect_num_epoch = trainer.num_epoch expect_metric = trainer.metrics.metric[0] expect_metric_tracker = json.loads(json2str(trainer.metric_tracker)) trainer.load_checkpoint(serialize_dir=serialize_dir) loaded_model_state_dict = json.loads(json2str(trainer.model.state_dict())) loaded_optimizer_state_dict = json.loads(json2str(trainer.optimizer.state_dict())) current_epoch = trainer.current_epoch num_epoch = trainer.num_epoch metric = trainer.metrics.metric[0] metric_tracker = json.loads(json2str(trainer.metric_tracker)) ASSERT.assertDictEqual(expect_model_state_dict, loaded_model_state_dict) ASSERT.assertDictEqual(expect_optimizer_state_dict, loaded_optimizer_state_dict) ASSERT.assertEqual(expect_current_epoch, current_epoch) ASSERT.assertEqual(expect_num_epoch, num_epoch) ASSERT.assertDictEqual(expect_metric, metric) ASSERT.assertDictEqual(expect_metric_tracker, metric_tracker)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="results", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_interbert.json", type=str, help="The config file which specified the model details.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument("--num_workers", type=int, default=16, help="Number of workers in the dataloader.") parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument("--tasks", default='', type=str, help="1-2-3... 
training task separate by -") parser.add_argument("--in_memory", default=False, type=bool, help="whether use chunck for parallel training.") parser.add_argument("--zero_shot", action="store_true", help="whether use single stream baseline.") parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument("--batch_size", default=1, type=int, help="which split to use.") args = parser.parse_args() with open('interbert_tasks.yml', 'r') as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) from bertmodel.modules import BertConfig task_names = [] for i, task_id in enumerate(args.tasks.split('-')): task = 'TASK' + task_id name = task_cfg[task]['name'] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] if '/' in args.from_pretrained: timeStamp = args.from_pretrained.split('/')[1] else: timeStamp = args.from_pretrained savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) bert_weight_name = json.load( open("config/" + "bert-base-uncased_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \ = LoadDatasetEval(args, task_cfg, args.tasks.split('-')) num_labels = max( [dataset.num_labels for dataset in task_datasets_val.values()]) config.fast_mode = True if args.zero_shot: model = InterBertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config) else: model = InterBertForVLTasks.from_pretrained(args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu) task_losses = LoadLosses(args, task_cfg, args.tasks.split('-')) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, deay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() # when run evaluate, we run each task sequentially. 
for task_id in task_ids: results = [] others = [] score_matrix = np.zeros((5000, 1000)) target_matrix = np.zeros((5000, 1000)) rank_matrix = np.ones((5000)) * 1000 count = 0 for i, batch in enumerate(task_dataloader_val[task_id]): batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, question, target, input_mask, segment_ids, caption_idx, image_idx = batch if task_id in ['TASK3']: batch_size = features.size(0) features = features.squeeze(0) spatials = spatials.squeeze(0) image_mask = image_mask.squeeze(0) multimodal_mask = torch.cat( (image_mask, input_mask.expand(image_mask.size(0), -1)), dim=-1) question = question.expand(features.size(0), -1) with torch.no_grad(): if args.zero_shot: _, _, vil_logit, _ = model(question, features, spatials, segment_ids, input_mask, image_mask, multimodal_mask=multimodal_mask) else: _, _, vil_logit, _, _, _, _ = model( question, features, spatials, segment_ids, input_mask, image_mask, multimodal_mask=multimodal_mask) score_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = torch.softmax( vil_logit, dim=1)[:, 0].view(-1).cpu().numpy() target_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = target.view(-1).float().cpu().numpy() if image_idx.item() == 1: rank = np.where( (np.argsort(-score_matrix[caption_idx]) == np.where( target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0] rank_matrix[caption_idx] = rank rank_matrix_tmp = rank_matrix[:caption_idx + 1] r1 = 100.0 * np.sum( rank_matrix_tmp < 1) / len(rank_matrix_tmp) r5 = 100.0 * np.sum( rank_matrix_tmp < 5) / len(rank_matrix_tmp) r10 = 100.0 * np.sum( rank_matrix_tmp < 10) / len(rank_matrix_tmp) medr = np.floor(np.median(rank_matrix_tmp) + 1) meanr = np.mean(rank_matrix_tmp) + 1 print( "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (count, r1, r5, r10, medr, meanr)) results.append( np.argsort(-score_matrix[caption_idx]).tolist()[:20]) count += 1 r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix) r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix) r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix) medr = np.floor(np.median(rank_matrix) + 1) meanr = np.mean(rank_matrix) + 1 print("************************************************") print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr)) print("************************************************") if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]['val_split']) json.dump(results, open(json_path + '_result.json', 'w')) json.dump(others, open(json_path + '_others.json', 'w'))
def main(): global args, best_prec1 args = parser.parse_args() if int(args.rank) == int(args.world_size) - 1: log_level = logging.INFO else: log_level = logging.WARNING logging.basicConfig( level=log_level, format="[%(asctime)s] %(name)s:%(levelname)s: %(message)s") logging.info(f'Find median: {args.find_median}') logging.warning( f'master addr: {args.master_addr}, rank:{args.rank}, local_rank:{args.local_rank}' ) torch.cuda.set_device(args.local_rank) # define loss function (criterion) criterion = nn.CrossEntropyLoss() # create stages of the model module = importlib.import_module(args.module) args.arch = module.arch() model = module.model(criterion) # determine shapes of all tensors in passed-in model if args.arch == 'inception_v3': input_size = [args.batch_size, 3, 299, 299] else: #input_size = [args.batch_size, 3, 224, 224] input_size = [args.batch_size, 3, 32, 32] training_tensor_shapes = { "input0": input_size, "target": [args.batch_size] } dtypes = {"input0": torch.int64, "target": torch.int64} inputs_module_destinations = {"input": 0} target_tensor_names = {"target"} for (stage, inputs, outputs) in model[:-1]: # Skip last layer (loss). input_tensors = [] for input in inputs: input_tensor = torch.zeros(tuple(training_tensor_shapes[input]), dtype=torch.float32) input_tensors.append(input_tensor) with torch.no_grad(): output_tensors = stage(*tuple(input_tensors)) if not type(output_tensors) is tuple: output_tensors = [output_tensors] for output, output_tensor in zip(outputs, list(output_tensors)): training_tensor_shapes[output] = list(output_tensor.size()) dtypes[output] = output_tensor.dtype eval_tensor_shapes = {} for key in training_tensor_shapes: eval_tensor_shapes[key] = tuple([args.eval_batch_size] + training_tensor_shapes[key][1:]) training_tensor_shapes[key] = tuple(training_tensor_shapes[key]) configuration_maps = { 'module_to_stage_map': None, 'stage_to_rank_map': None, 'stage_to_depth_map': None } if args.config_path is not None: json_config_file = json.load(open(args.config_path, 'r')) configuration_maps['module_to_stage_map'] = json_config_file.get( "module_to_stage_map", None) configuration_maps['stage_to_rank_map'] = json_config_file.get( "stage_to_rank_map", None) configuration_maps['stage_to_rank_map'] = { int(k): v for (k, v) in configuration_maps['stage_to_rank_map'].items() } configuration_maps['stage_to_depth_map'] = json_config_file.get( "stage_to_depth_map", None) r = runtime.StageRuntime( model=model, distributed_backend=args.distributed_backend, fp16=args.fp16, loss_scale=args.loss_scale, training_tensor_shapes=training_tensor_shapes, eval_tensor_shapes=eval_tensor_shapes, training_tensor_dtypes=dtypes, inputs_module_destinations=inputs_module_destinations, target_tensor_names=target_tensor_names, configuration_maps=configuration_maps, master_addr=args.master_addr, rank=args.rank, local_rank=args.local_rank, num_ranks_in_server=args.num_ranks_in_server, verbose_freq=args.verbose_frequency, model_type=runtime.IMAGE_CLASSIFICATION, port=args.port, enable_recompute=args.recompute) # stage needed to determine if current stage is the first stage # num_stages needed to determine if current stage is the last stage # num_ranks needed to determine number of warmup_minibatches in case of pipelining args.stage = r.stage args.num_stages = r.num_stages args.num_ranks = r.num_ranks if not is_first_stage(): args.synthetic_data = True # define optimizer if args.no_input_pipelining: num_versions = 1 else: # number of versions is the total number of machines following the 
current # stage, shared amongst all replicas in this stage num_versions = r.num_warmup_minibatches + 1 # if specified, resume from checkpoint if args.resume: checkpoint_file_path = "%s.%d.pth.tar" % (args.resume, r.stage) assert os.path.isfile(checkpoint_file_path) logging.info("=> loading checkpoint '{}'".format(checkpoint_file_path)) checkpoint = torch.load(checkpoint_file_path) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] r.load_state_dict(checkpoint['state_dict']) logging.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_file_path, checkpoint['epoch'])) #optimizer = sgd.SGDWithWeightStashing(r.modules(), r.master_parameters, if args.spectrain: if args.log_dir != None: args.log_dir += '_spectrain' logging.info('Using spectrain') if args.square: if args.log_dir != None: args.log_dir += '_square' logging.info('s = version difference ^ 2') else: logging.info('s = version difference') optimizer = sgd.SGDWithSpectrain(r.modules(), r.master_parameters, r.model_parameters, args.loss_scale, num_versions=num_versions, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, verbose_freq=args.verbose_frequency, macrobatch=args.macrobatch) else: logging.info('Not using spectrain') optimizer = sgd.SGDWithWeightStashing( r.modules(), r.master_parameters, r.model_parameters, args.loss_scale, num_versions=num_versions, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, verbose_freq=args.verbose_frequency, macrobatch=args.macrobatch) logging.info(f'log_dir: {args.log_dir}') if args.resume: optimizer.load_state_dict(checkpoint['optimizer']) cudnn.benchmark = True # Data loading code normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if args.arch == 'inception_v3': if args.synthetic_data: train_dataset = SyntheticDataset((3, 299, 299), 10000) else: traindir = os.path.join(args.data_dir, 'train') train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(299), transforms.ToTensor(), normalize, ])) else: if args.synthetic_data: train_dataset = SyntheticDataset((3, 224, 224), 50000) else: traindir = os.path.join(args.data_dir, 'train') transform_train = transforms.Compose([ #transforms.RandomResizedCrop(224), transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) #train_dataset = datasets.ImageFolder( train_dataset = datasets.CIFAR10( traindir, True, transform_train, #transforms.Compose([ # transforms.RandomResizedCrop(224), # transforms.RandomHorizontalFlip(), # transforms.ToTensor(), # normalize, #]), download=True) if args.synthetic_data: val_dataset = SyntheticDataset((3, 224, 224), 10000) else: valdir = os.path.join(args.data_dir, 'val') transform_test = transforms.Compose([ #transforms.RandomResizedCrop(224), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) #val_dataset = datasets.ImageFolder(valdir, transforms.Compose([ val_dataset = datasets.CIFAR10( valdir, False, transform_test, #transforms.Compose([ # transforms.Resize(256), # transforms.CenterCrop(224), # transforms.ToTensor(), # normalize, #]), download=True, ) global writer if dist.get_rank() == dist.get_world_size() - 1: writer = SummaryWriter(args.log_dir) distributed_sampler = False train_sampler = None val_sampler = None if configuration_maps['stage_to_rank_map'] is not None: num_ranks_in_first_stage = len( 
configuration_maps['stage_to_rank_map'][0]) if num_ranks_in_first_stage > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=num_ranks_in_first_stage, rank=args.rank) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset, num_replicas=num_ranks_in_first_stage, rank=args.rank) distributed_sampler = True train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, drop_last=True) # if checkpoint is loaded, start by running validation if args.resume: assert args.start_epoch > 0 validate(val_loader, r, args.start_epoch - 1) for epoch in range(args.start_epoch, args.epochs): if distributed_sampler: train_sampler.set_epoch(epoch) # train or run forward pass only for one epoch if args.forward_only: validate(val_loader, r, epoch) else: train(train_loader, r, optimizer, epoch) # evaluate on validation set prec1 = validate(val_loader, r, epoch) if r.stage != r.num_stages: prec1 = 0 # remember best prec@1 and save checkpoint best_prec1 = max(prec1, best_prec1) should_save_checkpoint = args.checkpoint_dir_not_nfs or r.rank_in_stage == 0 if args.checkpoint_dir and should_save_checkpoint: save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': r.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, args.checkpoint_dir, r.stage)
def is_logging_process():
    return not dist.is_initialized() or dist.get_rank() == 0
def validate(val_loader, r, epoch): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() # switch to evaluate mode n = r.num_iterations(loader_size=len(val_loader)) if args.num_minibatches is not None: n = min(n, args.num_minibatches) r.eval(n) if not is_first_stage(): val_loader = None r.set_loader(val_loader) end = time.time() epoch_start_time = time.time() if args.no_input_pipelining: num_warmup_minibatches = 0 else: num_warmup_minibatches = r.num_warmup_minibatches if args.verbose_frequency > 0: logging.info("Letting in %d warm-up minibatches" % num_warmup_minibatches) logging.info("Running validation for %d minibatches" % n) with torch.no_grad(): for i in range(num_warmup_minibatches): r.run_forward() for i in range(n - num_warmup_minibatches): # perform forward pass r.run_forward() r.run_ack() if is_last_stage(): output, target, loss = r.output, r.target, r.loss # measure accuracy and record loss prec1, prec5 = accuracy(output, target, topk=(1, 5)) losses.update(loss.item(), output.size(0)) top1.update(prec1[0], output.size(0)) top5.update(prec5[0], output.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: logging.info( 'Test: [{0}][{1}/{2}]\t' 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Memory: {memory:.3f}G ({cached_memory:.3f}G)\t' 'Loss: {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1: {top1.val:.2f}% ({top1.avg:.2f}%)\t' 'Prec@5: {top5.val:.2f}% ({top5.avg:.2f}%)'.format( epoch, i, n, batch_time=batch_time, loss=losses, top1=top1, top5=top5, memory=(float(torch.cuda.memory_allocated()) / 10**9), cached_memory=(float(torch.cuda.memory_cached()) / 10**9))) import sys sys.stdout.flush() if is_last_stage(): logging.info( ' * Prec@1 {top1.avg:.2f}% Prec@5 {top5.avg:.2f}%'.format( top1=top1, top5=top5)) for i in range(num_warmup_minibatches): r.run_ack() # wait for all helper threads to complete r.wait() logging.info('Epoch %d: %.3f seconds' % (epoch, time.time() - epoch_start_time)) logging.info("Epoch start time: %.3f, epoch end time: %.3f" % (epoch_start_time, time.time())) global writer if dist.get_rank() == dist.get_world_size() - 1: writer.add_scalar('Test/Loss', losses.avg, epoch) writer.add_scalar('Test/Accuracy', top1.avg, epoch) return top1.avg
def get_rank(): if not is_dist_avail_and_initialized(): return 0 return dist.get_rank()
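The helper above relies on an is_dist_avail_and_initialized check that is not shown in this excerpt; a minimal sketch of what such a guard typically looks like (assumed here, in the style of the common torchvision/DETR utilities):

import torch.distributed as dist

def is_dist_avail_and_initialized():
    # Guard both cases: the distributed package may be unavailable
    # (e.g. CPU-only builds) or simply not initialized in single-process runs.
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True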
parser.add_argument('--min-num-tensors', dest='min_num_tensors', action='store', default=2, type=int, help='set the inclusive lower limit for the number of ' + 'tensors to be sent during one test run; ' + 'default: 2 (10**2 = 100)') args = parser.parse_args() MIN_NUM_TENSORS = args.min_num_tensors MIN_BYTES = args.min_bytes MAX_NUM_TENSORS = args.max_num_tensors + 1 MAX_BYTES = args.max_bytes + 1 dist.init_process_group(backend=os.environ['BACKEND']) rank = dist.get_rank() dist.barrier() if rank == 0: print_header("broadcast") for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]: tensor = torch.ByteTensor(bytes).fill_(42) for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]: start = timer() for i in range(0, num_tensors): dist.broadcast(tensor, 0) end = timer() print_stats(bytes, num_tensors, end - start) print() else: for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
def __init__(self, train_data, model, optimizer=None, loss=None, callbacks_all=None, callbacks_master=None, batch_size_per_gpu=8, n_epochs=1, num_workers=1, drop_last=False, dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, save_path=None, device='auto', fp16='', use_tqdm=True): r""" :param train_data: training set, a :class:`~fastNLP.DataSet` object. :param nn.modules model: the model to train :param optimizer: a `torch.optim.Optimizer` optimizer. If None, the Trainer falls back to the default Adam(model.parameters(), lr=4e-3) optimizer :param loss: the :class:`~fastNLP.core.losses.LossBase` object to use. If None, :class:`~fastNLP.LossInForward` is used by default :param list callbacks_all: callbacks that adjust the training process and run in every training process. See the :mod:`callback module <fastNLP.core.callback>` for available callbacks :param list callbacks_master: callbacks that adjust the training process but run in only one process (the Master process). See the :mod:`callback module <fastNLP.core.callback>` for available callbacks :param int batch_size_per_gpu: batch size of each process during training. :param int n_epochs: number of training epochs. :param num_workers: int, number of worker threads used to pad the data. :param drop_last: drop the last batch if it does not contain exactly batch_size samples :param dev_data: DataSet used for validation, a :class:`~fastNLP.DataSet` object. :param metrics: metrics used for validation. Either a single :class:`Metric<fastNLP.core.metrics.MetricBase>` or several of them passed in as a list. If validation yields a better result (with multiple metrics, the first one in the list decides) and save_path is not None, the current model is saved. See the :mod:`metrics module <fastNLP.core.metrics>` for the available metrics. Only effective when dev_data is given. :param str,None metric_key: a :class:`Metric<fastNLP.core.metrics.MetricBase>` may report several values, e.g. :class:`~fastNLP.core.metrics.SpanFPreRecMetric` contains 'f', 'pre' and 'rec'; in that case this key selects which one to use. Some metrics are better when smaller, such as language-model perplexity; prefix the key with '-' to indicate that a smaller value is better during validation (e.g. "-ppl"). Only effective when dev_data is given. :param update_every: int, number of steps between gradient updates. Useful for gradient accumulation, e.g. when a batch size of 128 is needed but would not fit in memory: setting batch_size=32 and update_every=4 achieves the same effect. Ignored when optimizer is None. :param int print_every: number of backward passes between updates of the loss shown by tqdm; if use_tqdm=False, the loss is printed every this many backward passes instead. :param int validate_every: validate on the dev set every this many steps; if -1, validate once at the end of every epoch. Only effective when dev_data is given. :param str,None save_path: path under which to save the model; missing folders are created automatically. If None, the model is not saved. If dev_data is None, the model from the last iteration is saved. Both the parameters and the model structure are saved; even when DataParallel is used, only the model itself is saved. :param str device: the device to use, one of gpu, cpu or auto :param str fp16: Apex AMP optimization level for half-precision training, one of O1, O2 or O3; an empty string disables half precision. :param bool use_tqdm: whether to use tqdm to show training progress; if False, the loss is printed to the terminal instead. """ assert device in [ 'auto', 'cuda', 'cpu' ], "Please set correct device in ['auto', 'cuda', 'cpu']" if device == 'auto': device = 'cuda' if torch.cuda.is_available() else 'cpu' # init distributed if device == 'cuda': torch.cuda.set_device(get_local_rank()) self.device = torch.device("cuda", get_local_rank()) else: self.device = torch.device(device) init_logger_dist() self.world_size = dist.get_world_size() self.rank = dist.get_rank() # unique id for each process self.train_data = train_data self.batch_size_per_gpu = int(batch_size_per_gpu) self.n_epochs = int(n_epochs) self.num_data_workers = int(num_workers) self.drop_last = drop_last self.update_every = int(update_every) self.print_every = int(print_every) self.validate_every = int(validate_every) self.save_path = save_path self.losser = _prepare_losser(loss) self.fp16 = fp16 self.local_rank = get_local_rank() self._forward_func = model.forward self.callback_manager = DistCallbackManager( env={"trainer": self}, callbacks_all=callbacks_all, callbacks_master=callbacks_master) self.test_manager = DistCallbackManager(env={'trainer': self}) self.metric_key = metric_key self.use_tqdm = use_tqdm model.to(self.device) optimizer = self._get_optimizer(optimizer)
# init fp16, must come before DataParallel init if len(self.fp16): assert isinstance( self.fp16, str ), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']" _check_fp16() assert device == 'cuda', "Amp requires cuda device" model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) # init DataParallel if parse_version(torch.__version__) >= parse_version('1.1'): self.ddp_model = DDP(model, device_ids=[self.local_rank], output_device=self.local_rank, find_unused_parameters=True) else: self.ddp_model = DDP(model, device_ids=[self.local_rank], output_device=self.local_rank) self.model = self.ddp_model.module self.optimizer = optimizer self.sampler = DistributedSampler(self.train_data) self.data_iterator = self._get_data_iter(self.train_data) self.batch_size = self.world_size * self.batch_size_per_gpu self.n_steps = self._get_n_steps() # for evaluation, only run eval on master proc if dev_data and metrics: cb = _TesterCallback(dev_data, model, metrics, batch_size=batch_size_per_gpu, num_workers=num_workers) self.test_manager.add_callback([cb], master=True) # Setup logging dist.barrier() self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M') if self.save_path: self.cp_save_path = self.save_path else: self.cp_save_path = None # use INFO in the master, WARN for others self.logger = logger self.logger.info("Setup Distributed Trainer") self.logger.warning( "Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}". format(os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False)) self.logger.info("Num of processes: {}".format(self.world_size)) self.logger.info("Use device: {}".format(device)) self.logger.info( "Training with fp16: {}, optimization level: {}".format( len(self.fp16) > 0, self.fp16 if self.fp16 else None))
def debug(content, who='all'): if who == 'all' or who == dist.get_rank(): logger.debug(content)
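A short, assumed usage example for the rank-filtered debug helper above; the logger setup is hypothetical, and the calls presume init_process_group() has already run.

import logging
import torch.distributed as dist

logger = logging.getLogger(__name__)

def debug(content, who='all'):
    # Log on every rank, or only on the rank given by `who`.
    if who == 'all' or who == dist.get_rank():
        logger.debug(content)

# Hypothetical usage after dist.init_process_group():
debug("loss tensor shapes checked")   # emitted by every rank
debug("checkpoint written", who=0)    # emitted only by rank 0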
def train(model_config, model, benchmark_config, model_specs, args): lm_dataloader, _, _ = model_config["data"] criterion = benchmark_config["criterion"] vocab_size = model_specs["vocab_size"] optimizer = model_config["optimizer"] model.train() log_number_of_parameters(model) total_loss = 0.0 word_counter = 0 optimizer = optimizer(model.parameters()) pipe_group = model.group if hasattr(model, "group") else None if args.ddp_zero: model = DDP( model, device_ids=[torch.cuda.current_device()], process_group=get_data_parallel_group(), find_unused_parameters=False, ) # TODO(anj-s): Avoid sending fake data to all replicas except the first and last one. if pipe_group and pipe_group.rank() != 0 and pipe_group.rank() != (pipe_group.size() - 1): lm_dataloader, _, _ = get_synthetic_dataloaders(args, benchmark_config, model_specs) total_tokens = 0 total_tokens_per_log_interval = 0 bptt = 2 start_time = time.time() epoch_start_time = 0.0 def get_batch(source): seq_len = len(source) - 1 data = source[0:seq_len] target = source[1 : 1 + seq_len] return data, target for i, batch in enumerate(lm_dataloader): if i == 1: epoch_start_time = time.time() source, target = get_batch(batch) if args.max_batch and i > args.max_batch: break if i > 0: total_tokens += source.numel() optimizer.zero_grad() try: if (pipe_group is None or pipe_group.rank() == 0) and not args.ddp_zero: tmp = source.to(get_device(model, 0)) output = model(tmp) else: output = model(source) except Exception as e: raise RuntimeError(f"training failed on {torch.distributed.get_rank()}") from e if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: target = target.to(get_device(model, -1)) output = output.to(target.device) loss = criterion(output.view(-1, vocab_size), target.view(-1)) if args.ddp_zero: ddp_group = get_data_parallel_group() torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM, group=ddp_group) loss /= ddp_group.size() loss.backward() del target else: if args.ddp_zero: model.module.back_helper(output) else: model.back_helper(output) del output torch.nn.utils.clip_grad_value_(model.parameters(), model_specs["clip_value"]) optimizer.step() if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1: total_loss += loss.item() log_interval = 1 total_tokens_per_log_interval += source.numel() if i % log_interval == 0 and i > 0: cur_loss = total_loss / log_interval elapsed = time.time() - start_time if not args.multiprocess or dist.get_rank() == dist.get_world_size() - 1: print( "| batch {:5d} | wps {:5.2f} | loss {:5.2f} | ppl {:8.2f}".format( i, total_tokens_per_log_interval / elapsed, cur_loss, math.exp(cur_loss) ) ) total_tokens_per_log_interval = 0 total_loss = 0 start_time = time.time() if epoch_start_time != 0: wps = total_tokens / (time.time() - epoch_start_time) else: raise RuntimeError( "Unable to benchmark on a single batch. Increase the size " " of the dataset and rerun the benchmark." ) if not args.multiprocess or dist.get_rank() == dist.get_world_size() - 1: return wps, loss.item() else: return 0.0, 0.0
def run(rank, size): torch.manual_seed(1234) test_set, bsz = partition_dataset() model = load_model(nn.parallel.DistributedDataParallel(Net()), "sgd_150_0.1_state_Dict_150.pth").float() num_batches = np.ceil(len(test_set.dataset) / float(bsz)) best_loss = float("inf") preds, labels = get_all_preds(model, test_set) #print("Preds Size") #print(preds.size()) ([7551,15]) #print("Labels Size") ([7551]) #print(labels.size()) pred_lbl_fl = preds.argmax(1).float() lbl_fl = labels.float() prediction_list = [torch.zeros_like(pred_lbl_fl) for _ in range(size)] labels_list = [torch.zeros_like(pred_lbl_fl) for _ in range(size)] #print(labels) if dist.get_rank() == 0: gather(pred_lbl_fl, prediction_list) gather(lbl_fl, labels_list) else: gather(pred_lbl_fl) gather(lbl_fl) if dist.get_rank() == 0: new_preds = torch.tensor([], dtype=torch.float32) new_labels = torch.tensor([], dtype=torch.float32) for t1 in prediction_list: new_preds = torch.cat((new_preds, t1), dim=0) for t2 in labels_list: new_labels = torch.cat((new_labels, t2), dim=0) print("Preds:") k = new_preds.tolist() print(k[0:20]) print("Actual:") j = new_labels.tolist() print(j[0:20]) accuracy = calculate_accuracy(new_labels, new_preds) print("Accuracy : ", accuracy) print("Classification Report") print( classification_report(new_labels, new_preds, target_names=class_names)) #roc_auc = roc_auc_compute_fn(new_preds, new_labels) #print("ROC-AUC score :", roc_auc) cm = get_confusion_matrix(new_labels, new_preds) print("Confusion Matrix :") print(cm)
def train(): logger = logging.getLogger() is_dist = dist.is_initialized() ## dataset dl = get_data_loader(cfg, mode='train', distributed=is_dist) ## model net, criteria_pre, criteria_aux = set_model() ## optimizer optim = set_optimizer(net) ## mixed precision training scaler = amp.GradScaler() ## ddp training net = set_model_dist(net) ## meters time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters() ## lr scheduler lr_schdr = WarmupPolyLrScheduler(optim, power=0.9, max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters, warmup_ratio=0.1, warmup='exp', last_epoch=-1,) ## train loop for it, (im, lb) in enumerate(dl): im = im.cuda() lb = lb.cuda() lb = torch.squeeze(lb, 1) optim.zero_grad() with amp.autocast(enabled=cfg.use_fp16): logits, *logits_aux = net(im) loss_pre = criteria_pre(logits, lb) loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)] loss = loss_pre + sum(loss_aux) scaler.scale(loss).backward() scaler.step(optim) scaler.update() torch.cuda.synchronize() time_meter.update() loss_meter.update(loss.item()) loss_pre_meter.update(loss_pre.item()) _ = [mter.update(lss.item()) for mter, lss in zip(loss_aux_meters, loss_aux)] ## print training log message if (it + 1) % 100 == 0: lr = lr_schdr.get_lr() lr = sum(lr) / len(lr) print_log_msg( it, cfg.max_iter, lr, time_meter, loss_meter, loss_pre_meter, loss_aux_meters) lr_schdr.step() ## dump the final model and evaluate the result save_pth = osp.join(cfg.respth, 'model_final.pth') logger.info('\nsave models to {}'.format(save_pth)) state = net.module.state_dict() if dist.get_rank() == 0: torch.save(state, save_pth) logger.info('\nevaluating the final model') torch.cuda.empty_cache() heads, mious = eval_model(cfg, net) logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl')) return
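The set_model_dist call above is defined elsewhere; a minimal sketch of the DDP wrapping it presumably performs (the use of torch.cuda.current_device() assumes torch.cuda.set_device(local_rank) was called during process setup, which is an assumption, not taken from this excerpt):

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

def set_model_dist(net):
    # Assumed sketch: wrap the model in DDP on the GPU owned by this process.
    local_rank = torch.cuda.current_device()
    net = DDP(net.cuda(local_rank), device_ids=[local_rank], output_device=local_rank)
    return net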
def save_layers_on_all_rank_zero_workers(ctx, model): gpus_per_model = ctx["gpus_per_model"] rank = torch_distrib.get_rank() if rank in range(gpus_per_model): seq = list(model.children())[0] torch.save(seq, f"seq_{rank}.pt")
def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1, dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False, spatial_group_size=1, communicator=None): super(SpatialBottleneck, self).__init__() if groups != 1: raise RuntimeError('Only support groups == 1') if dilation != 1: raise RuntimeError('Only support dilation == 1') if norm_func == None: norm_func = FrozenBatchNorm2d else: raise RuntimeError('Only support frozen BN now.') if stride != 1 or in_channels != out_channels: self.downsample = nn.Sequential( conv1x1(in_channels, out_channels, stride), norm_func(out_channels), ) else: self.downsample = None # Both self.conv2 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv1x1(in_channels, bottleneck_channels, stride) self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels) self.conv3 = conv1x1(bottleneck_channels, out_channels) self.relu = nn.ReLU(inplace=True) self.stride = stride self.bn1 = norm_func(bottleneck_channels) self.bn2 = norm_func(bottleneck_channels) self.bn3 = norm_func(out_channels) self.use_cudnn = use_cudnn # setup conv weights self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight] if self.downsample is not None: self.w_conv.append(self.downsample[0].weight) # init weight in nchw format before possible transpose for w in self.w_conv: kaiming_uniform_(w, a=1) # TODO: prevent unsupported case usage # support cases # native cudnn # normal yes no # channel_last yes yes # explicit_nhwc no yes self.explicit_nhwc = explicit_nhwc if self.explicit_nhwc: for p in self.parameters(): with torch.no_grad(): p.data = p.data.permute(0, 2, 3, 1).contiguous() # spatial communicator self.spatial_group_size = spatial_group_size if spatial_group_size > 1: world_size = dist.get_world_size() num_groups = world_size // spatial_group_size assert ( num_groups * spatial_group_size == world_size ), "torch.distributed.get_world_size() must be multiple of group_size" rank = dist.get_rank() self.local_rank = rank % spatial_group_size if communicator is None: for group in range(num_groups): ranks = list( range(group * spatial_group_size, (group + 1) * spatial_group_size)) comm = torch.distributed.new_group(ranks=ranks) if rank in ranks: self.communicator = comm else: self.communicator = communicator self.stream1 = torch.cuda.Stream() self.spatial_args = self.spatial_group_size, self.local_rank, self.communicator, self.stream1 else: self.spatial_args = 1, 0, None, None return
def get_rank(): return dist.get_rank()
def set_main_rpc_process(self): self.main_rpc_process = torch_distrib.get_rank( group=mpu.get_pipeline_parallel_group()) == 0
def train(args): is_distributed = len(args.hosts) > 1 and args.backend is not None logger.debug("Distributed training - {}".format(is_distributed)) use_cuda = args.num_gpus > 0 logger.debug("Number of gpus available - {}".format(args.num_gpus)) kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} device = torch.device("cuda" if use_cuda else "cpu") if is_distributed: # Initialize the distributed environment. world_size = len(args.hosts) os.environ['WORLD_SIZE'] = str(world_size) host_rank = args.hosts.index(args.current_host) dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size) logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format( dist.get_rank(), args.num_gpus)) # set the seed for generating random numbers torch.manual_seed(args.seed) if use_cuda: torch.cuda.manual_seed(args.seed) train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs) test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) logger.debug("Processes {}/{} ({:.0f}%) of train data".format( len(train_loader.sampler), len(train_loader.dataset), 100. * len(train_loader.sampler) / len(train_loader.dataset) )) logger.debug("Processes {}/{} ({:.0f}%) of test data".format( len(test_loader.sampler), len(test_loader.dataset), 100. * len(test_loader.sampler) / len(test_loader.dataset) )) model = Net().to(device) if is_distributed and use_cuda: # multi-machine multi-gpu case model = torch.nn.parallel.DistributedDataParallel(model) else: # single-machine multi-gpu case or single-machine or multi-machine cpu case model = torch.nn.DataParallel(model) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(1, args.epochs + 1): model.train() for batch_idx, (data, target) in enumerate(train_loader, 1): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() if is_distributed and not use_cuda: # average gradients manually for multi-machine cpu case only _average_gradients(model) optimizer.step() if batch_idx % args.log_interval == 0: logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.sampler), 100. * batch_idx / len(train_loader), loss.item())) test(model, test_loader, device) save_model(model, args.model_dir)
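The _average_gradients helper referenced above for the multi-machine CPU case is not included in this excerpt; a sketch of the usual implementation (assumed, not copied from the source) is:

import torch.distributed as dist

def _average_gradients(model):
    # All-reduce each parameter's gradient and divide by the world size,
    # so every process applies the same averaged gradient in optimizer.step().
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is None:
            continue
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size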
def get_rank(): if not dist.is_available(): return 0 if not dist.is_initialized(): return 0 return dist.get_rank()
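A small, assumed usage sketch for the guarded get_rank pattern above: restrict side effects such as checkpoint writing to rank 0 and keep the other ranks in step with a barrier (the helper name and path are illustrative, not from the source):

import torch
import torch.distributed as dist

def save_on_rank_zero(state, path="checkpoint.pt"):
    # Hypothetical helper: only one process writes the file, then all
    # processes synchronize so nobody reads a half-written checkpoint.
    initialized = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if initialized else 0
    if rank == 0:
        torch.save(state, path)
    if initialized:
        dist.barrier()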
def validate(val_loader, model, criterion, epoch, start_time, log_writer): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() global iter_ptr model.eval() end = time.time() prefetcher = data_prefetcher(val_loader) input, target = prefetcher.next() i = -1 while input is not None: i += 1 target = target.cuda(non_blocking=True) input_var = Variable(input) target_var = Variable(target) # compute output with torch.no_grad(): output = model(input_var) loss = criterion(output, target_var) reduced_loss = reduce_tensor(loss.data) # measure accuracy and record loss (use the all-reduced values so every rank logs the same averages) prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) reduced_prec1 = reduce_tensor(prec1) reduced_prec5 = reduce_tensor(prec5) losses.update(to_python_float(reduced_loss), input.size(0)) top1.update(to_python_float(reduced_prec1), input.size(0)) top5.update(to_python_float(reduced_prec5), input.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if dist.get_rank() == 0 and i % args.print_freq == 0: print('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1, top5=top5)) input, target = prefetcher.next() time_diff = datetime.now()-start_time if dist.get_rank() == 0: print(f'~~{epoch}\t{float(time_diff.total_seconds() / 3600.0)}\t{top5.avg:.3f}\n') print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)) if log_writer: log_writer.add_scalar('test_iter/top1', top1.get_avg(), iter_ptr) log_writer.add_scalar('test_iter/top5', top5.get_avg(), iter_ptr) log_writer.add_scalar('test_iter/loss', losses.get_avg(), iter_ptr) log_writer.add_scalar('test_iter/batch_time', batch_time.get_avg(), iter_ptr) log_writer.add_scalar('test_epoch/top1', top1.get_avg(), epoch) log_writer.add_scalar('test_epoch/top5', top5.get_avg(), epoch) log_writer.add_scalar('test_epoch/loss', losses.get_avg(), epoch) log_writer.add_scalar('test_time/top1', top1.get_avg(), train_record.get_time()) log_writer.add_scalar('test_time/top5', top5.get_avg(), train_record.get_time()) log_writer.add_scalar('test_time/loss', losses.get_avg(), train_record.get_time()) return top1.avg
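reduce_tensor and to_python_float are used above but defined elsewhere; a sketch of the typical implementations (assumed here: average a metric across all ranks and convert it to a plain float):

import torch
import torch.distributed as dist

def reduce_tensor(tensor):
    # Average a scalar metric tensor across all ranks so every process
    # reports the same value.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt

def to_python_float(t):
    # Works for both 0-dim tensors and plain Python numbers.
    return t.item() if hasattr(t, 'item') else float(t)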
logger.info(f'E{epoch} V{v} * msIoU {IoUs}') if overall_acc: logger.info(f'E{epoch} V{v} * OA {overall_acc:.4%}') return mIoU if __name__ == "__main__": args, config = parse_config() torch.cuda.set_device(config.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True torch.backends.cudnn.deterministic = True os.makedirs(args.log_dir, exist_ok=True) os.environ["JOB_LOAD_DIR"] = os.path.dirname(config.load_path) logger = setup_logger(output=config.log_dir, distributed_rank=dist.get_rank(), name="s3dis_eval") if dist.get_rank() == 0: path = os.path.join(config.log_dir, "config.json") with open(path, 'w') as f: json.dump(vars(args), f, indent=2) json.dump(vars(config), f, indent=2) os.system('cp %s %s' % (args.cfg, config.log_dir)) logger.info("Full config saved to {}".format(path)) # main function main(config)
def _train(args): is_distributed = len(args.hosts) > 1 and args.dist_backend is not None logger.debug("Distributed training - {}".format(is_distributed)) if is_distributed: # Initialize the distributed environment. world_size = len(args.hosts) os.environ['WORLD_SIZE'] = str(world_size) host_rank = args.hosts.index(args.current_host) dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size) logger.info( 'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( args.dist_backend, dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format( dist.get_rank(), torch.cuda.is_available(), args.num_gpus)) device = 'cuda' if torch.cuda.is_available() else 'cpu' logger.info("Device Type: {}".format(device)) logger.info("Loading Cifar10 dataset") transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root=args.data_dir, train=True, download=False, transform=transform) train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) testset = torchvision.datasets.CIFAR10(root=args.data_dir, train=False, download=False, transform=transform) test_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers) logger.info("Model loaded") model = Net() if torch.cuda.device_count() > 1: logger.info("Gpu count: {}".format(torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) criterion = nn.CrossEntropyLoss().to(device) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(0, args.epochs): running_loss = 0.0 for i, data in enumerate(train_loader): # get the inputs inputs, labels = data inputs, labels = inputs.to(device), labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = model(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # print statistics running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 print('Finished Training') return _save_model(model, args.model_dir)
def test_synchronize_sgd(): torch.manual_seed(42) dist.init_process_group('mpi') rank = dist.get_rank() world_size = dist.get_world_size() device = torch.device('cpu') # device = torch.device('cuda') # Uncomment this to run on GPU # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random Tensors to hold input and outputs x = torch.randn(N, D_in, device=device) y = torch.randn(N, D_out, device=device) x = x[rank::world_size] y = y[rank::world_size] # Create random Tensors for weights; setting requires_grad=True means that we # want to compute gradients for these Tensors during the backward pass. w1 = torch.randn(D_in, H, device=device, requires_grad=True) w2 = torch.randn(H, D_out, device=device, requires_grad=True) learning_rate = 1e-6 for t in range(500): # Forward pass: compute predicted y using operations on Tensors. Since w1 and # w2 have requires_grad=True, operations involving these Tensors will cause # PyTorch to build a computational graph, allowing automatic computation of # gradients. Since we are no longer implementing the backward pass by hand we # don't need to keep references to intermediate values. y_pred = x.mm(w1).clamp(min=0).mm(w2) # Compute and print loss. Loss is a Tensor of shape (), and loss.item() # is a Python number giving its value. loss = (y_pred - y).pow(2).sum() if rank == 0: print("Iter {} : {:10.3e}".format(t, loss.item())) # Use autograd to compute the backward pass. This call will compute the # gradient of loss with respect to all Tensors with requires_grad=True. # After this call w1.grad and w2.grad will be Tensors holding the gradient # of the loss with respect to w1 and w2 respectively. loss.backward() # Update weights using gradient descent. For this step we just want to mutate # the values of w1 and w2 in-place; we don't want to build up a computational # graph for the update steps, so we use the torch.no_grad() context manager # to prevent PyTorch from building a computational graph for the updates with torch.no_grad(): w1 -= learning_rate * w1.grad w2 -= learning_rate * w2.grad # Manually zero the gradients after running the backward pass w1.grad.zero_() w2.grad.zero_() # Synchronize weights dist.all_reduce(w1, op=dist.reduce_op.SUM) dist.all_reduce(w2, op=dist.reduce_op.SUM) w1 /= world_size w2 /= world_size
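The test above averages the weights after each update step; a common alternative (sketched below, not part of the original test, and mathematically equivalent here because all replicas start from the same seed) is to all-reduce the gradients before the update, which communicates the same amount of data per step:

import torch
import torch.distributed as dist

def average_gradients(params):
    # All-reduce the gradients so each rank applies the same averaged update.
    world_size = dist.get_world_size()
    for p in params:
        if p.grad is not None:
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
            p.grad /= world_size

# Inside the training loop this would replace the post-step weight averaging:
#     loss.backward()
#     average_gradients([w1, w2])
#     with torch.no_grad():
#         w1 -= learning_rate * w1.grad
#         w2 -= learning_rate * w2.grad
#         w1.grad.zero_(); w2.grad.zero_()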
def test_basic_math_ops(self): ops = [ "torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/" ] spec = ChunkShardingSpec( dim=0, placements=[ "rank:0/cuda:0", "rank:1/cuda:1", "rank:2/cuda:2", "rank:3/cuda:3", ], ) sharded_lhs = sharded_tensor.rand(spec, (12, 3)) sharded_rhs = sharded_tensor.rand(spec, (12, 3)) current_rank = dist.get_rank() global_lhs = torch.empty( (12, 3), device=current_rank) if current_rank == 0 else None global_rhs = torch.empty( (12, 3), device=current_rank) if current_rank == 0 else None sharded_lhs.gather(dst=0, out=global_lhs) sharded_rhs.gather(dst=0, out=global_rhs) for op in ops: binary_op = gen_binary_op_func(op) binary_op_ = gen_binary_op_func(op, inplace=True) # test basic math ops between ShardedTensors sharded_output = binary_op(sharded_lhs, sharded_rhs) output = torch.empty( (12, 3), device=current_rank) if current_rank == 0 else None sharded_output.gather(dst=0, out=output) if current_rank == 0: global_output = binary_op(global_lhs, global_rhs) self.assertEqual(output, global_output) # test basic math ops between ShardedTensor and scalar scalars = [3, 1.8] for scalar in scalars: sharded_output_lhs = binary_op(sharded_lhs, scalar) sharded_output_lhs_ = binary_op_(sharded_lhs, scalar) self.assertTrue( torch.allclose(sharded_output_lhs, sharded_output_lhs_)) output_lhs = torch.empty( (12, 3), device=current_rank) if current_rank == 0 else None sharded_output_lhs.gather(dst=0, out=output_lhs) sharded_output_rhs = binary_op(scalar, sharded_lhs) output_rhs = torch.empty( (12, 3), device=current_rank) if current_rank == 0 else None sharded_output_rhs.gather(dst=0, out=output_rhs) if current_rank == 0: global_output_lhs = binary_op(global_lhs, scalar) global_output_rhs = binary_op(scalar, global_lhs) self.assertEqual(output_lhs, global_output_lhs) self.assertEqual(output_rhs, global_output_rhs)
def main(): args = create_argparser().parse_args() dist_util.setup_dist() logger.configure() logger.log("creating model and diffusion...") model, diffusion = create_model_and_diffusion( **args_to_dict(args, model_and_diffusion_defaults().keys())) model.load_state_dict( dist_util.load_state_dict(args.model_path, map_location="cpu")) model.to(dist_util.dev()) if args.use_fp16: model.convert_to_fp16() model.eval() logger.log("loading classifier...") classifier = create_classifier( **args_to_dict(args, classifier_defaults().keys())) classifier.load_state_dict( dist_util.load_state_dict(args.classifier_path, map_location="cpu")) classifier.to(dist_util.dev()) if args.classifier_use_fp16: classifier.convert_to_fp16() classifier.eval() def cond_fn(x, t, y=None): assert y is not None with th.enable_grad(): x_in = x.detach().requires_grad_(True) logits = classifier(x_in, t) log_probs = F.log_softmax(logits, dim=-1) selected = log_probs[range(len(logits)), y.view(-1)] return th.autograd.grad(selected.sum(), x_in)[0] * args.classifier_scale def model_fn(x, t, y=None): assert y is not None return model(x, t, y if args.class_cond else None) logger.log("sampling...") all_images = [] all_labels = [] while len(all_images) * args.batch_size < args.num_samples: model_kwargs = {} classes = th.randint(low=0, high=NUM_CLASSES, size=(args.batch_size, ), device=dist_util.dev()) model_kwargs["y"] = classes sample_fn = (diffusion.p_sample_loop if not args.use_ddim else diffusion.ddim_sample_loop) sample = sample_fn( model_fn, (args.batch_size, 3, args.image_size, args.image_size), clip_denoised=args.clip_denoised, model_kwargs=model_kwargs, cond_fn=cond_fn, device=dist_util.dev(), ) sample = ((sample + 1) * 127.5).clamp(0, 255).to(th.uint8) sample = sample.permute(0, 2, 3, 1) sample = sample.contiguous() gathered_samples = [ th.zeros_like(sample) for _ in range(dist.get_world_size()) ] dist.all_gather(gathered_samples, sample) # gather not supported with NCCL all_images.extend( [sample.cpu().numpy() for sample in gathered_samples]) gathered_labels = [ th.zeros_like(classes) for _ in range(dist.get_world_size()) ] dist.all_gather(gathered_labels, classes) all_labels.extend([labels.cpu().numpy() for labels in gathered_labels]) logger.log(f"created {len(all_images) * args.batch_size} samples") arr = np.concatenate(all_images, axis=0) arr = arr[:args.num_samples] label_arr = np.concatenate(all_labels, axis=0) label_arr = label_arr[:args.num_samples] if dist.get_rank() == 0: shape_str = "x".join([str(x) for x in arr.shape]) out_path = os.path.join(logger.get_dir(), f"samples_{shape_str}.npz") logger.log(f"saving to {out_path}") np.savez(out_path, arr, label_arr) dist.barrier() logger.log("sampling complete")