def train_worker(dataset, device, rank=0, world_size=None):
    torch.cuda.set_device(device)
    criterion = TripletMarginRankingLoss(args.loss_margin)
    model = TransformerPool(args.vocab_size, args.embedding_dim,
                            args.hidden_dim, pre_trained=GLOVE)
    if args.re_train:
        model.load_state_dict(torch.load(
            args.train_model, map_location='cuda:{}'.format(device)))
    else:
        model.apply(init_weights)
    model, criterion = model.to(device), criterion.to(device)

    triplet_dataset = TripletDataset(dataset)
    in_distributed_mode = True if world_size else False
    if in_distributed_mode:
        rank, device = torch.distributed.get_rank(), torch.cuda.current_device()
        print("rank:{}, device:{}".format(rank, device))

    if in_distributed_mode:
        model = DistributedDataParallel(model, device_ids=[device])
        datasampler = DistributedSampler(triplet_dataset)
        dataloader = DataLoader(triplet_dataset, shuffle=False, pin_memory=True,
                                num_workers=0, batch_size=args.batch_size,
                                sampler=datasampler)
    else:
        dataloader = DataLoader(triplet_dataset, shuffle=True, pin_memory=True,
                                num_workers=4, batch_size=args.batch_size)

    optimizer = RAdam(model.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.t_max, eta_min=args.eta_min)

    model.train()
    best_avg_loss = None
    t1 = time.time()
    for epoch in range(args.epoch):
        if in_distributed_mode:
            datasampler.set_epoch(epoch)
        total_loss = []
        bar = tqdm(desc='EPOCH {:02d}'.format(epoch), total=len(dataloader),
                   leave=False) if rank == 0 else None
        for triplet in dataloader:
            optimizer.zero_grad()
            anchor, positive, negative = model(triplet)
            loss = criterion(anchor, positive, negative)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            total_loss.append(loss.item())
            if rank == 0:
                bar.update()
        if rank == 0:
            bar.close()
            epoch_avg_loss = np.mean(total_loss)
            print("Epoch {:02d}, Time {:.02f}s, AvgLoss {:.08f}, lr {:.8f}".format(
                epoch, time.time() - t1, epoch_avg_loss,
                optimizer.param_groups[0]['lr']))
            if best_avg_loss is None or epoch_avg_loss < best_avg_loss:
                best_avg_loss = epoch_avg_loss
                state_dict = model.module.state_dict() if in_distributed_mode \
                    else model.state_dict()
                torch.save(state_dict, args.model_path)
            t1 = time.time()
        # Step the cosine schedule once per epoch on every rank. The original
        # additionally called scheduler.step(epoch_avg_loss) on rank 0 only,
        # which double-stepped rank 0 and passed a loss where an epoch index
        # is expected.
        scheduler.step()
    torch.cuda.empty_cache()
    return
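# A minimal launcher sketch for the worker above, following the usual
# torch.multiprocessing pattern. The entry-point names (`_spawn_entry`, `main`),
# the MASTER_ADDR/MASTER_PORT values and the choice of the nccl backend are
# illustrative assumptions, not part of the original code.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def _spawn_entry(local_rank, world_size, dataset):
    # One process per GPU; the process group must exist before train_worker
    # calls torch.distributed.get_rank().
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('nccl', rank=local_rank, world_size=world_size)
    train_worker(dataset, device=local_rank, rank=local_rank,
                 world_size=world_size)
    dist.destroy_process_group()


def main(dataset):
    world_size = torch.cuda.device_count()
    if world_size > 1:
        # mp.spawn passes the process index as the first argument.
        mp.spawn(_spawn_entry, args=(world_size, dataset), nprocs=world_size)
    else:
        train_worker(dataset, device=0)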
def run(proc_id, n_gpus, args, devices, dataset): dev_id = devices[proc_id] train_labels = dataset.train_labels train_truths = dataset.train_truths num_edges = train_truths.shape[0] reverse_types = { to_etype_name(k): 'rev-' + to_etype_name(k) for k in dataset.possible_rating_values } reverse_types.update({v: k for k, v in reverse_types.items()}) sampler = dgl.dataloading.MultiLayerNeighborSampler([None], return_eids=True) dataloader = dgl.dataloading.EdgeDataLoader(dataset.train_enc_graph, { to_etype_name(k): th.arange( dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k))) for k in dataset.possible_rating_values }, sampler, batch_size=args.minibatch_size, shuffle=True, drop_last=False) if proc_id == 0: valid_dataloader = dgl.dataloading.EdgeDataLoader( dataset.valid_dec_graph, th.arange(dataset.valid_dec_graph.number_of_edges()), sampler, g_sampling=dataset.valid_enc_graph, batch_size=args.minibatch_size, shuffle=False, drop_last=False) test_dataloader = dgl.dataloading.EdgeDataLoader( dataset.test_dec_graph, th.arange(dataset.test_dec_graph.number_of_edges()), sampler, g_sampling=dataset.test_enc_graph, batch_size=args.minibatch_size, shuffle=False, drop_last=False) if n_gpus > 1: dist_init_method = 'tcp://{master_ip}:{master_port}'.format( master_ip='127.0.0.1', master_port='12345') world_size = n_gpus th.distributed.init_process_group(backend="nccl", init_method=dist_init_method, world_size=world_size, rank=dev_id) if n_gpus > 0: th.cuda.set_device(dev_id) nd_possible_rating_values = \ th.FloatTensor(dataset.possible_rating_values) nd_possible_rating_values = nd_possible_rating_values.to(dev_id) start = time.time() net = Net(args=args, dev_id=dev_id) net = net.to(dev_id) if n_gpus > 1: net = DistributedDataParallel(net, device_ids=[dev_id], output_device=dev_id) rating_loss_net = nn.CrossEntropyLoss() learning_rate = args.train_lr optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate) print("Loading network finished ...\n") ### declare the loss information best_valid_rmse = np.inf no_better_valid = 0 best_epoch = -1 count_rmse = 0 count_num = 0 count_loss = 0 print("Start training ...") dur = [] iter_idx = 1 logging_str = None ### prepare the logger train_loss_logger = MetricLogger( ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'], os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id)) valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join( args.save_dir, 'valid_loss%d.csv' % args.save_id)) test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], os.path.join( args.save_dir, 'test_loss%d.csv' % args.save_id)) for epoch in range(1, args.train_max_epoch): if epoch == 1: print("Total #Param of net: %d" % (torch_total_param_num(net))) print( torch_net_info(net, save_path=os.path.join( args.save_dir, 'net%d.txt' % args.save_id))) if epoch > 1: t0 = time.time() net.train() with tqdm.tqdm(dataloader) as tq: for step, (input_nodes, pair_graph, blocks) in enumerate(tq): head_feat, tail_feat, blocks = load_subtensor( input_nodes, pair_graph, blocks, dataset, dataset.train_enc_graph) frontier = blocks[0] compact_g = flatten_etypes(pair_graph, dataset, 'train').to(dev_id) true_relation_labels = compact_g.edata['label'] true_relation_ratings = compact_g.edata['rating'] head_feat = head_feat.to(dev_id) tail_feat = tail_feat.to(dev_id) frontier = frontier.to(dev_id) pred_ratings = net(compact_g, frontier, head_feat, tail_feat, dataset.possible_rating_values) loss = rating_loss_net(pred_ratings, 
true_relation_labels.to(dev_id)).mean() count_loss += loss.item() optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip) optimizer.step() if proc_id == 0 and iter_idx == 1: print("Total #Param of net: %d" % (torch_total_param_num(net))) real_pred_ratings = ( th.softmax(pred_ratings, dim=1) * nd_possible_rating_values.view(1, -1)).sum(dim=1) rmse = ((real_pred_ratings - true_relation_ratings.to(dev_id))**2).sum() count_rmse += rmse.item() count_num += pred_ratings.shape[0] if iter_idx % args.train_log_interval == 0: train_loss_logger.log(iter=iter_idx, loss=count_loss / (iter_idx + 1), rmse=count_rmse / count_num) tq.set_postfix( { 'loss': '{:.4f}'.format(count_loss / iter_idx), 'rmse': '{:.4f}'.format(count_rmse / count_num) }, refresh=False) iter_idx += 1 if epoch > 1: epoch_time = time.time() - t0 print("Epoch {} time {}".format(epoch, epoch_time)) if epoch % args.train_valid_interval == 0: if n_gpus > 1: th.distributed.barrier() if proc_id == 0: valid_rmse = evaluate(args=args, dev_id=dev_id, net=net, dataset=dataset, dataloader=valid_dataloader, segment='valid') valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse) logging_str = 'Val RMSE={:.4f}'.format(valid_rmse) if valid_rmse < best_valid_rmse: best_valid_rmse = valid_rmse no_better_valid = 0 best_epoch = epoch test_rmse = evaluate(args=args, dev_id=dev_id, net=net, dataset=dataset, dataloader=test_dataloader, segment='test') best_test_rmse = test_rmse test_loss_logger.log(iter=iter_idx, rmse=test_rmse) logging_str += ', Test RMSE={:.4f}'.format(test_rmse) else: no_better_valid += 1 if no_better_valid > args.train_early_stopping_patience\ and learning_rate <= args.train_min_lr: logging.info( "Early stopping threshold reached. Stop training.") break if no_better_valid > args.train_decay_patience: new_lr = max( learning_rate * args.train_lr_decay_factor, args.train_min_lr) if new_lr < learning_rate: logging.info("\tChange the LR to %g" % new_lr) learning_rate = new_lr for p in optimizer.param_groups: p['lr'] = learning_rate no_better_valid = 0 print("Change the LR to %g" % new_lr) # sync on evalution if n_gpus > 1: th.distributed.barrier() if logging_str is not None: print(logging_str) if proc_id == 0: print( 'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'. format(best_epoch, best_valid_rmse, best_test_rmse)) train_loss_logger.close() valid_loss_logger.close() test_loss_logger.close() with open( os.path.join(args.save_dir, f'duration_{args.save_id:d}.txt'), 'a') as f: print(f'wall: {time.time() - start}') f.write(f'wall: {time.time() - start}')
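# A sketch of how the edge-sampling `run(proc_id, n_gpus, args, devices, dataset)`
# worker above might be launched, one process per GPU, in the multiprocessing
# style used by DGL examples. The flag name `args.gpu` (a comma-separated list
# of device ids) and the `launch` helper are assumptions for illustration only.
import torch.multiprocessing as mp


def launch(args, dataset):
    devices = list(map(int, args.gpu.split(',')))  # hypothetical flag
    n_gpus = len(devices)
    if n_gpus == 1:
        run(0, n_gpus, args, devices, dataset)
    else:
        procs = []
        for proc_id in range(n_gpus):
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, args, devices, dataset))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()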
def run(args, device, data):
    g, node_feats, num_of_ntype, num_classes, num_rels, \
        train_nid, val_nid, test_nid, labels, global_val_nid, global_test_nid = data

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]

    sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(dataset=train_nid.numpy(),
                                batch_size=args.batch_size,
                                collate_fn=sampler.sample_blocks,
                                shuffle=True,
                                drop_last=False)

    valid_sampler = NeighborSampler(g, val_fanouts,
                                    dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    valid_dataloader = DistDataLoader(dataset=val_nid.numpy(),
                                      batch_size=args.batch_size,
                                      collate_fn=valid_sampler.sample_blocks,
                                      shuffle=False,
                                      drop_last=False)

    test_sampler = NeighborSampler(g, [-1] * args.n_layers,
                                   dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    test_dataloader = DistDataLoader(dataset=test_nid.numpy(),
                                     batch_size=args.batch_size,
                                     collate_fn=test_sampler.sample_blocks,
                                     shuffle=False,
                                     drop_last=False)

    embed_layer = DistEmbedLayer(device, g, num_of_ntype, args.n_hidden,
                                 sparse_emb=args.sparse_embedding,
                                 dgl_sparse_emb=args.dgl_sparse)
    model = EntityClassify(device, args.n_hidden, num_classes, num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)
    model = model.to(device)
    if not args.standalone:
        model = th.nn.parallel.DistributedDataParallel(model)
        if args.sparse_embedding and not args.dgl_sparse:
            embed_layer = DistributedDataParallel(embed_layer, device_ids=None,
                                                  output_device=None)

    if args.sparse_embedding:
        if args.dgl_sparse:
            emb_optimizer = dgl.distributed.SparseAdagrad(
                [embed_layer.node_embeds], lr=args.sparse_lr)
        else:
            emb_optimizer = th.optim.SparseAdam(
                embed_layer.module.node_embeds.parameters(), lr=args.sparse_lr)
        optimizer = th.optim.Adam(model.parameters(), lr=args.lr,
                                  weight_decay=args.l2norm)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params, lr=args.lr,
                                  weight_decay=args.l2norm)

    # training loop
    print("start training...")
    for epoch in range(args.n_epochs):
        tic = time.time()

        sample_time = 0
        copy_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        number_train = 0

        step_time = []
        iter_t = []
        sample_t = []
        feat_copy_t = []
        forward_t = []
        backward_t = []
        update_t = []
        iter_tput = []

        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph
        # as a list of blocks.
        step_time = []
        for step, sample_data in enumerate(dataloader):
            seeds, blocks = sample_data
            number_train += seeds.shape[0]
            tic_step = time.time()
            sample_time += tic_step - start
            sample_t.append(tic_step - start)

            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE], node_feats)
            label = labels[seeds]
            copy_time = time.time()
            feat_copy_t.append(copy_time - tic_step)

            # forward
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, label)
            forward_end = time.time()

            # backward
            optimizer.zero_grad()
            if args.sparse_embedding and not args.dgl_sparse:
                emb_optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_t.append(forward_end - copy_time)
            backward_t.append(compute_end - forward_end)

            # Aggregate gradients in multiple nodes and update the parameters.
            # (The original called optimizer.step() both right after
            # loss.backward() and again here; a single step per iteration is kept.)
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            update_t.append(time.time() - compute_end)

            step_t = time.time() - start
            step_time.append(step_t)

            if step % args.log_every == 0:
                print('[{}] Epoch {:05d} | Step {:05d} | Loss {:.4f} | time {:.3f} s'
                      '| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} | update {:.3f}'.format(
                          g.rank(), epoch, step, loss.item(),
                          np.sum(step_time[-args.log_every:]),
                          np.sum(sample_t[-args.log_every:]),
                          np.sum(feat_copy_t[-args.log_every:]),
                          np.sum(forward_t[-args.log_every:]),
                          np.sum(backward_t[-args.log_every:]),
                          np.sum(update_t[-args.log_every:])))
            start = time.time()

        print('[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, '
              'forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #number_train: {}'
              .format(g.rank(), np.sum(step_time), np.sum(sample_t),
                      np.sum(feat_copy_t), np.sum(forward_t),
                      np.sum(backward_t), np.sum(update_t), number_train))
        epoch += 1

        start = time.time()
        g.barrier()
        val_acc, test_acc = evaluate(g, model, embed_layer, labels,
                                     valid_dataloader, test_dataloader,
                                     node_feats, global_val_nid,
                                     global_test_nid)
        if val_acc >= 0:
            print('Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                val_acc, test_acc, time.time() - start))
def __init__(self, cfg, model_build_func):
    """
    Args:
        cfg (BaseConfig):
    """
    logger = logging.getLogger("cvpods")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()

    self.start_iter = 0
    data_loader = self.build_train_loader(cfg)
    maybe_adjust_epoch_and_iter(cfg, data_loader)
    self.max_iter = cfg.SOLVER.LR_SCHEDULER.MAX_ITER
    self.max_epoch = cfg.SOLVER.LR_SCHEDULER.MAX_EPOCH

    model = model_build_func(cfg)
    model = maybe_convert_module(model)
    logger.info(f"Model structure: {model}")

    # Assume these objects must be constructed in this order.
    optimizer = self.build_optimizer(cfg, model)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        if cfg.TRAINER.FP16.ENABLED:
            if cfg.TRAINER.FP16.TYPE == "APEX":
                model, optimizer = amp.initialize(
                    model, optimizer,
                    opt_level=cfg.TRAINER.FP16.OPTS.OPT_LEVEL)
        model = DistributedDataParallel(model,
                                        device_ids=[comm.get_local_rank()],
                                        broadcast_buffers=False,
                                        find_unused_parameters=True)

    # TODO: @wangfeng02, `batch_subdivisions`
    super().__init__(model, data_loader, optimizer,
                     cfg.SOLVER.BATCH_SUBDIVISIONS)

    if not cfg.SOLVER.LR_SCHEDULER.get("EPOCH_WISE", False):
        epoch_iters = -1
    else:
        epoch_iters = cfg.SOLVER.LR_SCHEDULER.get("EPOCH_ITERS")
        logger.warning(f"Setup LR Scheduler in EPOCH mode: {epoch_iters}")

    self.scheduler = self.build_lr_scheduler(cfg, optimizer,
                                             epoch_iters=epoch_iters)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks
    optional = {}
    if cfg.TRAINER.FP16.ENABLED:
        optional["amp"] = amp
    self.checkpointer = DefaultCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
        **optional,
    )
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
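# The constructor above depends on the APEX rule that amp.initialize() must wrap
# the model and optimizer *before* DistributedDataParallel. A minimal standalone
# sketch of that ordering (the helper name and default opt_level are assumptions,
# not cvpods API):
from apex import amp
from torch.nn.parallel import DistributedDataParallel


def wrap_fp16_ddp(model, optimizer, local_rank, opt_level="O1"):
    # 1) Patch model/optimizer for mixed precision first.
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    # 2) Only then wrap with DDP on the local device.
    model = DistributedDataParallel(model, device_ids=[local_rank],
                                    broadcast_buffers=False)
    return model, optimizer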
def main_worker(gpu, ngpus_per_node, args, cfg): args.gpu = gpu args.rank = args.rank * ngpus_per_node + gpu logger.info(f'rank: {args.rank} / {args.world_size}') dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) torch.cuda.set_device(args.gpu) if args.gpu == 0: mkdir(args.outdir) filename = os.path.join(args.outdir, 'log.txt') fh = logging.StreamHandler(cached_log_stream(filename)) fh.setLevel(logging.INFO) logger.addHandler(fh) plain_formatter = logging.Formatter( "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S") fh.setFormatter(plain_formatter) logger.info(args) # init the global global best_pred, acclist_train, acclist_val # seed torch.manual_seed(cfg.SEED) torch.cuda.manual_seed(cfg.SEED) # init dataloader transform_train, transform_val = get_transform(cfg.DATA.DATASET)( cfg.DATA.BASE_SIZE, cfg.DATA.CROP_SIZE, cfg.DATA.RAND_AUG) trainset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT, transform=transform_train, train=True, download=True) valset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT, transform=transform_val, train=False, download=True) train_sampler = torch.utils.data.distributed.DistributedSampler(trainset) train_loader = torch.utils.data.DataLoader( trainset, batch_size=cfg.TRAINING.BATCH_SIZE, shuffle=False, num_workers=cfg.TRAINING.WORKERS, pin_memory=True, sampler=train_sampler) val_sampler = torch.utils.data.distributed.DistributedSampler( valset, shuffle=False) val_loader = torch.utils.data.DataLoader( valset, batch_size=cfg.TRAINING.TEST_BATCH_SIZE, shuffle=False, num_workers=cfg.TRAINING.WORKERS, pin_memory=True, sampler=val_sampler) # init the model model_kwargs = {} if cfg.MODEL.FINAL_DROP > 0.0: model_kwargs['final_drop'] = cfg.MODEL.FINAL_DROP if cfg.TRAINING.LAST_GAMMA: model_kwargs['last_gamma'] = True model = get_model(cfg.MODEL.NAME)(**model_kwargs) if args.gpu == 0: logger.info(model) criterion, train_loader = get_criterion(cfg, train_loader, args.gpu) model.cuda(args.gpu) criterion.cuda(args.gpu) model = DistributedDataParallel(model, device_ids=[args.gpu]) # criterion and optimizer if cfg.OPTIMIZER.DISABLE_BN_WD: parameters = model.named_parameters() param_dict = {} for k, v in parameters: param_dict[k] = v bn_params = [ v for n, v in param_dict.items() if ('bn' in n or 'bias' in n) ] rest_params = [ v for n, v in param_dict.items() if not ('bn' in n or 'bias' in n) ] if args.gpu == 0: logger.info(" Weight decay NOT applied to BN parameters ") logger.info( f'len(parameters): {len(list(model.parameters()))} = {len(bn_params)} + {len(rest_params)}' ) optimizer = torch.optim.SGD([{ 'params': bn_params, 'weight_decay': 0 }, { 'params': rest_params, 'weight_decay': cfg.OPTIMIZER.WEIGHT_DECAY }], lr=cfg.OPTIMIZER.LR, momentum=cfg.OPTIMIZER.MOMENTUM, weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY) else: optimizer = torch.optim.SGD(model.parameters(), lr=cfg.OPTIMIZER.LR, momentum=cfg.OPTIMIZER.MOMENTUM, weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY) # check point if args.resume is not None: if os.path.isfile(args.resume): if args.gpu == 0: logger.info(f"=> loading checkpoint '{args.resume}'") with PathManager.open(args.resume, "rb") as f: checkpoint = torch.load(f) cfg.TRAINING.START_EPOCHS = checkpoint['epoch'] + 1 if cfg.TRAINING.START_EPOCHS == 0 \ else cfg.TRAINING.START_EPOCHS best_pred = checkpoint['best_pred'] acclist_train = checkpoint['acclist_train'] acclist_val = checkpoint['acclist_val'] model.module.load_state_dict(checkpoint['state_dict']) 
optimizer.load_state_dict(checkpoint['optimizer']) if args.gpu == 0: logger.info( f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})" ) else: raise RuntimeError( f"=> no resume checkpoint found at '{args.resume}'") scheduler = LR_Scheduler(cfg.OPTIMIZER.LR_SCHEDULER, base_lr=cfg.OPTIMIZER.LR, num_epochs=cfg.TRAINING.EPOCHS, iters_per_epoch=len(train_loader), warmup_epochs=cfg.OPTIMIZER.WARMUP_EPOCHS) def train(epoch): train_sampler.set_epoch(epoch) model.train() losses = AverageMeter() top1 = AverageMeter() global best_pred, acclist_train for batch_idx, (data, target) in enumerate(train_loader): scheduler(optimizer, batch_idx, epoch, best_pred) if not cfg.DATA.MIXUP: data, target = data.cuda(args.gpu), target.cuda(args.gpu) optimizer.zero_grad() output = model(data) loss = criterion(output, target) loss.backward() optimizer.step() if not cfg.DATA.MIXUP: acc1 = accuracy(output, target, topk=(1, )) top1.update(acc1[0], data.size(0)) losses.update(loss.item(), data.size(0)) if batch_idx % 100 == 0 and args.gpu == 0: if cfg.DATA.MIXUP: logger.info('Batch: %d| Loss: %.3f' % (batch_idx, losses.avg)) else: logger.info('Batch: %d| Loss: %.3f | Top1: %.3f' % (batch_idx, losses.avg, top1.avg)) acclist_train += [top1.avg] def validate(epoch): model.eval() top1 = AverageMeter() top5 = AverageMeter() global best_pred, acclist_train, acclist_val is_best = False for batch_idx, (data, target) in enumerate(val_loader): data, target = data.cuda(args.gpu), target.cuda(args.gpu) with torch.no_grad(): output = model(data) acc1, acc5 = accuracy(output, target, topk=(1, 5)) top1.update(acc1[0], data.size(0)) top5.update(acc5[0], data.size(0)) # sum all sum1, cnt1, sum5, cnt5 = torch_dist_sum(args.gpu, top1.sum, top1.count, top5.sum, top5.count) top1_acc = sum(sum1) / sum(cnt1) top5_acc = sum(sum5) / sum(cnt5) if args.gpu == 0: logger.info('Validation: Top1: %.3f | Top5: %.3f' % (top1_acc, top5_acc)) if args.eval_only: return top1_acc, top5_acc # save checkpoint acclist_val += [top1_acc] if top1_acc > best_pred: best_pred = top1_acc is_best = True save_checkpoint( { 'epoch': epoch, 'state_dict': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'best_pred': best_pred, 'acclist_train': acclist_train, 'acclist_val': acclist_val, }, directory=args.outdir, is_best=False, filename=f'checkpoint_{epoch}.pth') return top1_acc.item(), top5_acc.item() if args.export: if args.gpu == 0: with PathManager.open(args.export + '.pth', "wb") as f: torch.save(model.module.state_dict(), f) return if args.eval_only: top1_acc, top5_acc = validate(cfg.TRAINING.START_EPOCHS) metrics = { "top1": top1_acc, "top5": top5_acc, } if args.gpu == 0: with PathManager.open(os.path.join(args.outdir, 'metrics.json'), "w") as f: json.dump(metrics, f) return for epoch in range(cfg.TRAINING.START_EPOCHS, cfg.TRAINING.EPOCHS): tic = time.time() train(epoch) if epoch % 10 == 0: top1_acc, top5_acc = validate(epoch) elapsed = time.time() - tic if args.gpu == 0: logger.info(f'Epoch: {epoch}, Time cost: {elapsed}') # final evaluation top1_acc, top5_acc = validate(cfg.TRAINING.START_EPOCHS - 1) if args.gpu == 0: # save final checkpoint save_checkpoint( { 'epoch': cfg.TRAINING.EPOCHS - 1, 'state_dict': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'best_pred': best_pred, 'acclist_train': acclist_train, 'acclist_val': acclist_val, }, directory=args.outdir, is_best=False, filename='checkpoint_final.pth') # save final model weights with PathManager.open(os.path.join(args.outdir, 'model_weights.pth'), "wb") as f: 
torch.save(model.module.state_dict(), f) metrics = { "top1": top1_acc, "top5": top5_acc, } with PathManager.open(os.path.join(args.outdir, 'metrics.json'), "w") as f: json.dump(metrics, f)
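# A hedged sketch of the usual launcher for main_worker() above: one process per
# GPU on this node via torch.multiprocessing.spawn. It assumes args.world_size
# initially holds the number of nodes and args.rank the node index (main_worker
# then converts it to a global rank); the helper name `launch` is illustrative.
import torch
import torch.multiprocessing as mp


def launch(args, cfg):
    ngpus_per_node = torch.cuda.device_count()
    args.world_size = ngpus_per_node * args.world_size
    # spawn passes the GPU index as the first argument of main_worker.
    mp.spawn(main_worker, args=(ngpus_per_node, args, cfg),
             nprocs=ngpus_per_node)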
def prepare(self,
            ckpt_dir: str,
            optimizer: str = 'lars',
            learning_rate: float = 0.2,
            weight_decay: float = 1.5 * 1e-6,
            cosine_warmup: int = 10,
            cosine_cycles: int = 1,
            cosine_min_lr: float = 0.,
            epochs: int = 1000,
            batch_size: int = 256,
            num_workers: int = 0,
            distributed: bool = False,
            local_rank: int = 0,
            mixed_precision: bool = True,
            resume: str = None):
    """Prepare BYOL pre-training."""

    # Set attributes
    self.ckpt_dir = ckpt_dir
    self.epochs = epochs
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.distributed = distributed
    self.local_rank = local_rank
    self.mixed_precision = mixed_precision
    self.resume = resume

    self.optimizer = get_optimizer(
        params=[
            {'params': self.online_net.parameters()},
            {'params': self.online_predictor.parameters()},
        ],
        name=optimizer,
        lr=learning_rate,
        weight_decay=weight_decay  # TODO: remove params from batch norm
    )

    self.scheduler = get_cosine_scheduler(
        self.optimizer,
        epochs=self.epochs,
        warmup_steps=cosine_warmup,
        cycles=cosine_cycles,
        min_lr=cosine_min_lr,
    )

    # Resuming from previous checkpoint (optional)
    if resume is not None:
        if not os.path.exists(resume):
            raise FileNotFoundError
        self.load_model_from_checkpoint(resume)

    # Distributed training (optional, disabled by default)
    if distributed:
        self.online_net = DistributedDataParallel(
            module=self.online_net.to(local_rank), device_ids=[local_rank])
        self.online_predictor = DistributedDataParallel(
            module=self.online_predictor.to(local_rank),
            device_ids=[local_rank])
    else:
        self.online_net.to(local_rank)
        self.online_predictor.to(local_rank)

    # No DDP wrapping for target network; no gradient updates
    self.target_net.to(local_rank)

    # Mixed precision training (optional, enabled by default)
    self.scaler = torch.cuda.amp.GradScaler() if mixed_precision else None

    # TensorBoard
    self.writer = SummaryWriter(ckpt_dir) if local_rank == 0 else None

    # Ready to train
    self.prepared = True
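# A hedged usage sketch for prepare(). `trainer` stands for an instance of the
# class that owns this method (its name is not shown above); the checkpoint
# directory and hyper-parameter values are illustrative only. In the distributed
# case, each process would call this after init_process_group with its own
# local_rank.
import os

trainer.prepare(
    ckpt_dir='./checkpoints/byol',        # hypothetical path
    optimizer='lars',
    learning_rate=0.2,
    epochs=100,
    batch_size=256,
    distributed=True,
    local_rank=int(os.environ.get('LOCAL_RANK', 0)),
    mixed_precision=True,
)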
def train(): print("started") args = parser.parse_args() if not os.path.exists( args.output_dir ): # !!!NOTICE: change output dir for each different settings. os.makedirs(args.output_dir) print(args.output_dir) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) # Load pretrained model and tokenizer config_class, model_class, tokenizer_class = MODEL_CLASSES["gpt2"] config = config_class.from_pretrained(args.model_name_or_path) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) tokenizer.add_tokens(SPECIAL_TOKENS) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) # model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.resize_token_embeddings(len(tokenizer)) model.to(args.device) # Prepare optimizer and schedule (linear warmup and decay) # optimizer = OpenAIAdam(model.parameters(), lr=args.lr) optimizer = AdamW(model.parameters(), lr=args.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) cur_input_ids = batch[0] cur_lm_labels = batch[1] cur_token_type_ids = batch[2] model_outputs = model(input_ids=cur_input_ids, labels=cur_lm_labels, token_type_ids=cur_token_type_ids) lm_loss = model_outputs[0] loss = lm_loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) input_ids, lm_labels, token_type_ids = batch # logger.info(tokenizer.decode(input_ids[0, :].tolist())) model_outputs = model(input_ids, token_type_ids=token_type_ids) lm_logits = model_outputs[0] lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = 
lm_labels[..., 1:].contiguous().view(-1) return lm_logits_flat_shifted, lm_labels_flat_shifted evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))} metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) # tb_logger = TensorboardLogger(log_dir=args.output_dir) # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint( args.output_dir, 'checkpoint', save_interval=1, n_saved=3 ) # !!!NOTICE: if fill exist, it will report error. set require_empty=False can avoid this. trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation torch.save(args, args.output_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file( os.path.join(args.output_dir, CONFIG_NAME)) tokenizer.save_vocabulary(args.output_dir) # Run the training print(train_loader) trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(args.output_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner)
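# The metric definitions above reference average_distributed_scalar(); a
# plausible definition is sketched here as an assumption (such helpers typically
# all-reduce the value and divide by the world size, and are a no-op outside
# distributed mode).
import torch
import torch.distributed as dist


def average_distributed_scalar(scalar, args):
    """Average a scalar over all distributed processes."""
    if args.local_rank == -1:  # not in distributed mode
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float,
                            device=args.device) / dist.get_world_size()
    dist.all_reduce(scalar_t, op=dist.ReduceOp.SUM)
    return scalar_t.item()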
def train(proc_id, n_gpus, args, dataset, g, feats, paper_offset):
    # `devices` (the list of GPU ids to use) is expected to be visible from the
    # enclosing scope, since it is not part of the signature.
    dev_id = devices[proc_id]
    world_size = n_gpus
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12346')
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=dist_init_method,
                                             world_size=world_size,
                                             rank=proc_id)
    torch.cuda.set_device(dev_id)

    print('Loading masks and labels')
    train_idx = torch.LongTensor(dataset.get_idx_split('train')) + paper_offset
    valid_idx = torch.LongTensor(dataset.get_idx_split('valid')) + paper_offset
    label = dataset.paper_label

    print('Initializing dataloader...')
    sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 25])
    train_collator = ExternalNodeCollator(g, train_idx, sampler, paper_offset,
                                          feats, label)
    valid_collator = ExternalNodeCollator(g, valid_idx, sampler, paper_offset,
                                          feats, label)
    # Necessary according to https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_collator.dataset, num_replicas=world_size, rank=proc_id,
        shuffle=True, drop_last=False)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_collator.dataset, num_replicas=world_size, rank=proc_id,
        shuffle=True, drop_last=False)
    train_dataloader = torch.utils.data.DataLoader(
        train_collator.dataset, batch_size=1024,
        collate_fn=train_collator.collate, num_workers=4,
        sampler=train_sampler)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_collator.dataset, batch_size=1024,
        collate_fn=valid_collator.collate, num_workers=2,
        sampler=valid_sampler)

    print('Initializing model...')
    model = RGAT(dataset.num_paper_features, dataset.num_classes, 1024, 5, 2, 4,
                 0.5, 'paper').to(dev_id)
    # Convert BN to SyncBatchNorm.
    # See https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DistributedDataParallel(model, device_ids=[dev_id],
                                    output_device=dev_id)
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    sched = torch.optim.lr_scheduler.StepLR(opt, step_size=25, gamma=0.25)

    best_acc = 0
    for epoch in range(args.epochs):
        # Make shuffling work properly across multiple epochs.
        # See https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
        train_sampler.set_epoch(epoch)
        model.train()
        with tqdm.tqdm(train_dataloader) as tq:
            for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
                mfgs = [g.to(dev_id) for g in mfgs]
                x = mfgs[0].srcdata['x']
                y = mfgs[-1].dstdata['y']
                y_hat = model(mfgs, x)
                loss = F.cross_entropy(y_hat, y)
                opt.zero_grad()
                loss.backward()
                opt.step()
                acc = (y_hat.argmax(1) == y).float().mean()
                tq.set_postfix(
                    {
                        'loss': '%.4f' % loss.item(),
                        'acc': '%.4f' % acc.item()
                    }, refresh=False)

        # Evaluate in each process
        model.eval()
        correct = torch.LongTensor([0]).to(dev_id)
        total = torch.LongTensor([0]).to(dev_id)
        for step, (input_nodes, output_nodes, mfgs) in enumerate(
                tqdm.tqdm(valid_dataloader)):
            with torch.no_grad():
                mfgs = [g.to(dev_id) for g in mfgs]
                x = mfgs[0].srcdata['x']
                y = mfgs[-1].dstdata['y']
                y_hat = model(mfgs, x)
                correct += (y_hat.argmax(1) == y).sum().item()
                total += y_hat.shape[0]
        # `reduce` data into process 0
        torch.distributed.reduce(correct, dst=0,
                                 op=torch.distributed.ReduceOp.SUM)
        torch.distributed.reduce(total, dst=0,
                                 op=torch.distributed.ReduceOp.SUM)
        acc = (correct / total).item()
        sched.step()

        # Process 0 prints accuracy and saves the model
        if proc_id == 0:
            print('Validation accuracy:', acc)
            if best_acc < acc:
                best_acc = acc
                print('Updating best model...')
                torch.save(model.state_dict(), args.model_path)
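# The checkpoint saved above comes from the DDP-wrapped model, so its keys carry
# a "module." prefix. A hedged sketch of loading it back into a bare RGAT for
# inference (the helper name `load_for_inference` is hypothetical; the
# constructor arguments mirror the call above):
import torch


def load_for_inference(args, dataset, device='cuda'):
    model = RGAT(dataset.num_paper_features, dataset.num_classes, 1024, 5, 2, 4,
                 0.5, 'paper').to(device)
    state = torch.load(args.model_path, map_location=device)
    # Strip the DDP "module." prefix before loading into the unwrapped model.
    state = {k.replace('module.', '', 1): v for k, v in state.items()}
    model.load_state_dict(state)
    model.eval()
    return model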
def run(proc_id, n_gpus, args, devices, data): # Unpack data device = th.device(devices[proc_id]) if n_gpus > 0: th.cuda.set_device(device) if n_gpus > 1: dist_init_method = 'tcp://{master_ip}:{master_port}'.format( master_ip='127.0.0.1', master_port='12345') world_size = n_gpus th.distributed.init_process_group(backend="nccl", init_method=dist_init_method, world_size=world_size, rank=proc_id) train_nid, val_nid, test_nid, n_classes, g, nfeat, labels = data if args.data_device == 'gpu': nfeat = nfeat.to(device) labels = labels.to(device) elif args.data_device == 'uva': nfeat = dgl.contrib.UnifiedTensor(nfeat, device=device) labels = dgl.contrib.UnifiedTensor(labels, device=device) in_feats = nfeat.shape[1] # Create PyTorch DataLoader for constructing blocks n_edges = g.num_edges() train_seeds = th.arange(n_edges) if args.graph_device == 'gpu': train_seeds = train_seeds.to(device) g = g.to(device) args.num_workers = 0 elif args.graph_device == 'uva': train_seeds = train_seeds.to(device) g.pin_memory_() args.num_workers = 0 # Create sampler sampler = dgl.dataloading.MultiLayerNeighborSampler( [int(fanout) for fanout in args.fan_out.split(',')]) dataloader = dgl.dataloading.EdgeDataLoader( g, train_seeds, sampler, exclude='reverse_id', # For each edge with ID e in Reddit dataset, the reverse edge is e ± |E|/2. reverse_eids=th.cat( [th.arange(n_edges // 2, n_edges), th.arange(0, n_edges // 2)]).to(train_seeds), negative_sampler=NegativeSampler( g, args.num_negs, args.neg_share, device if args.graph_device == 'uva' else None), device=device, use_ddp=n_gpus > 1, batch_size=args.batch_size, shuffle=True, drop_last=False, num_workers=args.num_workers) # Define model and optimizer model = SAGE(in_feats, args.num_hidden, args.num_hidden, args.num_layers, F.relu, args.dropout) model = model.to(device) if n_gpus > 1: model = DistributedDataParallel(model, device_ids=[device], output_device=device) loss_fcn = CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=args.lr) # Training loop avg = 0 iter_pos = [] iter_neg = [] iter_d = [] iter_t = [] best_eval_acc = 0 best_test_acc = 0 for epoch in range(args.num_epochs): tic = time.time() # Loop over the dataloader to sample the computation dependency graph as a list of # blocks. 
tic_step = time.time() for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader): batch_inputs = nfeat[input_nodes].to(device) pos_graph = pos_graph.to(device) neg_graph = neg_graph.to(device) blocks = [block.int().to(device) for block in blocks] d_step = time.time() # Compute loss and prediction batch_pred = model(blocks, batch_inputs) loss = loss_fcn(batch_pred, pos_graph, neg_graph) optimizer.zero_grad() loss.backward() optimizer.step() t = time.time() pos_edges = pos_graph.num_edges() neg_edges = neg_graph.num_edges() iter_pos.append(pos_edges / (t - tic_step)) iter_neg.append(neg_edges / (t - tic_step)) iter_d.append(d_step - tic_step) iter_t.append(t - d_step) if step % args.log_every == 0 and proc_id == 0: gpu_mem_alloc = th.cuda.max_memory_allocated( ) / 1000000 if th.cuda.is_available() else 0 print( '[{}]Epoch {:05d} | Step {:05d} | Loss {:.4f} | Speed (samples/sec) {:.4f}|{:.4f} | Load {:.4f}| train {:.4f} | GPU {:.1f} MB' .format(proc_id, epoch, step, loss.item(), np.mean(iter_pos[3:]), np.mean(iter_neg[3:]), np.mean(iter_d[3:]), np.mean(iter_t[3:]), gpu_mem_alloc)) tic_step = time.time() if step % args.eval_every == 0 and proc_id == 0: eval_acc, test_acc = evaluate(model, g, nfeat, labels, train_nid, val_nid, test_nid, device) print('Eval Acc {:.4f} Test Acc {:.4f}'.format( eval_acc, test_acc)) if eval_acc > best_eval_acc: best_eval_acc = eval_acc best_test_acc = test_acc print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format( best_eval_acc, best_test_acc)) toc = time.time() if proc_id == 0: print('Epoch Time(s): {:.4f}'.format(toc - tic)) if epoch >= 5: avg += toc - tic if n_gpus > 1: th.distributed.barrier() if proc_id == 0: print('Avg epoch time: {}'.format(avg / (epoch - 4)))
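# The training loop above calls loss_fcn(batch_pred, pos_graph, neg_graph) with
# a custom CrossEntropyLoss. One plausible definition, close to DGL's
# unsupervised GraphSAGE example and included here as an assumption: score each
# positive and negative edge by a dot product of its endpoint embeddings, then
# apply binary cross-entropy with positive edges labeled 1 and negatives 0.
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F


class CrossEntropyLoss(nn.Module):
    def forward(self, block_outputs, pos_graph, neg_graph):
        with pos_graph.local_scope():
            pos_graph.ndata['h'] = block_outputs
            pos_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            pos_score = pos_graph.edata['score']
        with neg_graph.local_scope():
            neg_graph.ndata['h'] = block_outputs
            neg_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            neg_score = neg_graph.edata['score']
        score = th.cat([pos_score, neg_score])
        label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)])
        return F.binary_cross_entropy_with_logits(score, label)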
def main(config, embedding, model_path, run, imagedataset, local_rank, resnet, bkg): rank, world_size, device_id, device = setup(local_rank) print("Local rank: {} Rank: {} World Size: {} Device_id: {} Device: {}". format(local_rank, rank, world_size, device_id, device)) pth_extn = '.pth.tar' # Configuration CONFIG = Dict(yaml.load(open(config))) datadir = os.path.join('data/datasets', imagedataset) print("Split dir: ", datadir) savedir = osp.dirname(model_path) epoch = re.findall("checkpoint_(.*)\." + pth_extn[1:], osp.basename(model_path))[-1] if run == 'zlss' or run == 'flss': val = np.load(datadir + '/split/test_list.npy') visible_classes = np.load(datadir + '/split/novel_cls.npy') if bkg: visible_classes = np.asarray(np.concatenate( [np.array([0]), visible_classes]), dtype=int) elif run == 'gzlss' or run == 'gflss': val = np.load(datadir + '/split/test_list.npy') vals_cls = np.asarray(np.concatenate([ np.load(datadir + '/split/seen_cls.npy'), np.load(datadir + '/split/val_cls.npy') ]), dtype=int) if bkg: vals_cls = np.asarray(np.concatenate([np.array([0]), vals_cls]), dtype=int) valu_cls = np.load(datadir + '/split/novel_cls.npy') visible_classes = np.concatenate([vals_cls, valu_cls]) else: print("invalid run ", run) sys.exit() cls_map = np.array([255] * 256) for i, n in enumerate(visible_classes): cls_map[n] = i if run == 'gzlss' or run == 'gflss': novel_cls_map = np.array([255] * 256) for i, n in enumerate(list(valu_cls)): novel_cls_map[cls_map[n]] = i seen_cls_map = np.array([255] * 256) for i, n in enumerate(list(vals_cls)): seen_cls_map[cls_map[n]] = i all_labels = np.genfromtxt(datadir + '/labels_2.txt', delimiter='\t', usecols=1, dtype='str') print("Visible Classes: ", visible_classes) # Dataset dataset = get_dataset(CONFIG.DATASET)( train=None, test=val, root=CONFIG.ROOT, split=CONFIG.SPLIT.TEST, base_size=CONFIG.IMAGE.SIZE.TEST, mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.WARP_IMAGE, scale=None, flip=False, ) random.seed(42) if embedding == 'word2vec': class_emb = pickle.load( open(datadir + '/word_vectors/word2vec.pkl', "rb")) elif embedding == 'fasttext': class_emb = pickle.load( open(datadir + '/word_vectors/fasttext.pkl', "rb")) elif embedding == 'fastnvec': class_emb = np.concatenate([ pickle.load(open(datadir + '/word_vectors/fasttext.pkl', "rb")), pickle.load(open(datadir + '/word_vectors/word2vec.pkl', "rb")) ], axis=1) else: print("invalid emb ", embedding) sys.exit() class_emb = class_emb[visible_classes] class_emb = F.normalize(torch.tensor(class_emb), p=2, dim=1).cuda() print("Embedding dim: ", class_emb.shape[1]) print("# Visible Classes: ", class_emb.shape[0]) # DataLoader loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=CONFIG.BATCH_SIZE.TEST, num_workers=CONFIG.NUM_WORKERS, shuffle=False, sampler=DistributedSampler( dataset, num_replicas=world_size, rank=rank, shuffle=False), pin_memory=True, drop_last=True) torch.set_grad_enabled(False) # Model model = DeepLabV2_ResNet101_MSC(class_emb.shape[1], class_emb, resnet=resnet) state_dict = torch.load(model_path, map_location='cpu') model = DistributedDataParallel(model.to(device), device_ids=[rank]) new_state_dict = OrderedDict() if resnet == 'spnet': for k, v in state_dict['state_dict'].items(): name = k.replace("scale", "base") # 'scale'->base name = name.replace("stages.", "") new_state_dict[name] = v else: new_state_dict = state_dict['state_dict'] model.load_state_dict(new_state_dict) del state_dict model.eval() targets, outputs = [], [] loader_iter = 
iter(loader) iterations = len(loader_iter) print("Iterations: {}".format(iterations)) pbar = tqdm(loader, total=iterations, leave=False, dynamic_ncols=True, position=rank) for iteration in pbar: data, target, img_id = next(loader_iter) # Image data = data.to(device) # Forward propagation output = model(data) output = F.interpolate(output, size=data.shape[2:], mode="bilinear", align_corners=False) output = F.softmax(output, dim=1) target = cls_map[target.numpy()] remote_target = torch.tensor(target).to(device) if rank == 0: remote_target = torch.zeros_like(remote_target).to(device) output = torch.argmax(output, dim=1).cpu().numpy() remote_output = torch.tensor(output).to(device) if rank == 0: remote_output = torch.zeros_like(remote_output).to(device) for o, t in zip(output, target): outputs.append(o) targets.append(t) torch.distributed.reduce(remote_output, dst=0) torch.distributed.reduce(remote_target, dst=0) torch.distributed.barrier() if rank == 0: remote_output = remote_output.cpu().numpy() remote_target = remote_target.cpu().numpy() for o, t in zip(remote_output, remote_target): outputs.append(o) targets.append(t) if rank == 0: if run == 'gzlss' or run == 'gflss': score, class_iou = scores_gzsl(targets, outputs, n_class=len(visible_classes), seen_cls=cls_map[vals_cls], unseen_cls=cls_map[valu_cls]) else: score, class_iou = scores(targets, outputs, n_class=len(visible_classes)) for k, v in score.items(): print(k, v) score["Class IoU"] = {} for i in range(len(visible_classes)): score["Class IoU"][all_labels[visible_classes[i]]] = class_iou[i] name = "" name = model_path.replace(pth_extn, "_" + run + ".json") if bkg == True: with open(name.replace('.json', '_bkg.json'), "w") as f: json.dump(score, f, indent=4, sort_keys=True) else: with open(name, "w") as f: json.dump(score, f, indent=4, sort_keys=True) print(score["Class IoU"]) return
def train(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default= "/Users/tetracycline/repos/datascience/datascience/projects/counsel_chat_all_data_300-tokens.json", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model") parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training") parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient") parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer.") tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) # Add special tokens if they are not already added add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch (lm_loss), (mc_loss), *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, lm_labels=lm_labels) loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) # if we dont send labels to model, it doesnt return losses lm_logits, mc_logits, *_ = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, ) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: 
evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args.model_checkpoint) tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
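# For the distributed branch above (args.local_rank != -1), the script is meant
# to be launched with one process per GPU; a typical invocation (illustrative,
# the script name "train.py" is an assumption) would be:
#
#   python -m torch.distributed.launch --nproc_per_node=4 train.py --fp16 O1
#
# torch.distributed.launch passes --local_rank to each process and sets the
# MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE environment variables that
# init_process_group(init_method='env://') relies on.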
def __init__(self, args, train_samples, dev_samples, dev_ace_samples,
             test_ace_samples, cpt_model, id2cpt, id2et, cpt_id2et_id):
    self.id2cpt = id2cpt
    self.id2et = id2et
    self.cpt_id2et_id = cpt_id2et_id

    # init distributed
    self.device = torch.device("cuda:{}".format(args.local_rank)
                               if torch.cuda.is_available() else "cpu")
    # Setup logging (setLevel() returns None, so its result must not be assigned)
    logger.setLevel(logging.INFO if dist.get_rank() == 0 else logging.WARNING)

    # Synchronize start_time across processes
    sync_time = torch.tensor(time.time(), dtype=torch.double).to(self.device)
    dist.broadcast(sync_time, src=0)
    # self.start_time = datetime.fromtimestamp(sync_time.item()).strftime('%Y-%m-%d-%H-%M-%S-%f')

    self.n_gpu = len(args.device_id.split(','))
    self.rank = args.local_rank
    self.world_size = dist.get_world_size()
    # self.model = cpt_model
    self.logger = logger
    self.logger.info("model name: {}".format("CptNllODEE"))
    self.logger.info("add_mlm_object tag: {}".format(args.add_mlm_object))
    self.args = args

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # init data loaders
    # ************** train data ************************
    # train_sampler = DistributedSampler(train_samples)
    self.train_loader = train_samples
    # ************** dev data ************************
    self.dev_loader = dev_samples
    self.dev_ace_loader = dev_ace_samples
    # ************** test data ************************
    # self.test_loader = DataLoader(test_ace_samples,
    #                               batch_size=args.per_gpu_eval_batch_size,
    #                               collate_fn=test_ace_samples.collate_fn)

    cpt_model.to(self.device)
    self.optim_mode = 'AdamW'
    self.n_steps = len(self.train_loader) * args.train_epoch_num
    self.logger.info("dataloader length: {}".format(len(self.train_loader)))
    self.print_step = self.args.train_record_steps
    self.update_step = self.args.gradient_accumulation_steps
    self.dev_step = (self.args.dev_record_steps *
                     self.args.gradient_accumulation_steps /
                     self.args.gradient_average)
    self.test_step = (self.args.test_record_steps *
                      self.args.gradient_accumulation_steps /
                      self.args.gradient_average)
    self.optimizer, self.scheduler = adam_optimizer(
        args, cpt_model, self.optim_mode, t_total=self.n_steps,
        warmup_steps=int(self.n_steps * args.warmup_ratio))

    # init fp16; must happen before the DistributedDataParallel wrapping
    if len(args.fp16):
        assert isinstance(args.fp16, str), \
            "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']"
        cpt_model, self.optimizer = amp.initialize(cpt_model, self.optimizer,
                                                   opt_level=args.fp16)

    # init DistributedDataParallel
    self.ddp_model = DistributedDataParallel(cpt_model,
                                             device_ids=[args.local_rank],
                                             output_device=args.local_rank,
                                             find_unused_parameters=True)
    self.model = self.ddp_model.module

    self.logger.info("Setup Distributed Trainer")
    self.logger.warning(
        "Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format(
            os.getpid(), self.rank, args.local_rank, self.device,
            args.fp16 if args.fp16 else False))
    self.logger.info("Num of processes: {}".format(self.world_size))
    self.logger.info("Use device: {}".format(self.device))
    self.logger.info("Training with fp16: {}, optimization level: {}".format(
        len(args.fp16) > 0, args.fp16 if args.fp16 else None))
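# This constructor calls dist.get_rank() and dist.broadcast(), so the process
# group must already be initialized by the time it runs. A minimal sketch of the
# per-process setup that would precede it (the helper name and the nccl/env://
# choices mirror common practice and are assumptions, not this repository's
# exact code):
import torch
import torch.distributed as dist


def setup_distributed(args):
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')
    return dist.get_rank(), dist.get_world_size()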
def train_detector(model, dataset, cfg, distributed=False, validate=False,
                   logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds, cfg.data.samples_per_gpu,
                         cfg.data.workers_per_gpu, dist=distributed)
        for ds in dataset
    ]

    total_steps = cfg.total_epochs * len(data_loaders[0])
    # print(f"total_steps: {total_steps}")

    if cfg.lr_config.type == "one_cycle":
        # build trainer
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(optimizer,
                                                       cfg.lr_config,
                                                       total_steps)
        cfg.lr_config = None
    else:
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None

    # put model on gpus
    if distributed:
        # model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()

    logger.info(f"model structure: {model}")

    trainer = Trainer(model, batch_processor, optimizer, lr_scheduler,
                      cfg.work_dir, cfg.log_level)

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    trainer.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)

    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # # register eval hooks
    # if validate:
    #     val_dataset_cfg = cfg.data.val
    #     eval_cfg = cfg.get('evaluation', {})
    #     dataset_type = DATASETS.get(val_dataset_cfg.type)
    #     trainer.register_hook(
    #         KittiEvalmAPHookV2(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.resume(cfg.load_from, load_only_weights=True)

    # import ipdb; ipdb.set_trace()
    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs,
                local_rank=cfg.local_rank)
def main(args): # model = modeling.VOSNet(model=args.model).cuda() # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) model = modeling.VOSNet(model=args.model, sync_bn=True).cuda() model = DistributedDataParallel(model, device_ids=[args.local_rank], broadcast_buffers=False) criterion = CrossEntropy(temperature=args.temperature).cuda() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=True, weight_decay=args.wd) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=4e-5) if args.dataset == 'davis': train_dataset = dataset.DavisTrain( os.path.join(args.data, 'DAVIS_train/JPEGImages/480p'), os.path.join(args.data, 'DAVIS_train/Annotations/480p'), frame_num=args.frame_num, color_jitter=args.cj) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.bs // dist.get_world_size(), shuffle=False, sampler=train_sampler, pin_memory=True, num_workers=4 // dist.get_world_size(), drop_last=True) val_dataset = dataset.DavisTrain( os.path.join(args.data, 'DAVIS_val/JPEGImages/480p'), os.path.join(args.data, 'DAVIS_val/Annotations/480p'), frame_num=args.frame_num, color_jitter=args.cj) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.bs // dist.get_world_size(), shuffle=False, sampler=val_sampler, pin_memory=True, num_workers=4 // dist.get_world_size(), drop_last=True) else: raise NotImplementedError start_epoch = 0 if args.resume: if os.path.isfile(args.resume): logger.info("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) logger.info("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: logger.info("=> no checkpoint found at '{}'".format(args.resume)) for epoch in range(start_epoch, start_epoch + args.epochs): train_sampler.set_epoch(epoch) # reshuffle shards each epoch train_loss = train(train_loader, model, criterion, optimizer, epoch, args) with torch.no_grad(): val_loss = validate(val_loader, model, criterion, args) scheduler.step() if dist.get_rank() == 0: os.makedirs(args.save_model, exist_ok=True) checkpoint_name = 'checkpoint-epoch-{}.pth.tar'.format(epoch) save_path = os.path.join(args.save_model, checkpoint_name) torch.save( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }, save_path)
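# main() above assumes torch.distributed is already initialized (it calls dist.get_world_size()
# and dist.get_rank()) and that args.local_rank was supplied by the launcher. A minimal sketch
# of that setup, assuming the script is started with
#   python -m torch.distributed.launch --nproc_per_node=<num_gpus> train.py ...
# (the helper name is illustrative):
import argparse
import torch
import torch.distributed as dist


def init_distributed_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)  # injected by torch.distributed.launch
    args, _ = parser.parse_known_args()
    torch.cuda.set_device(args.local_rank)
    # 'env://' picks up MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from the environment
    dist.init_process_group(backend='nccl', init_method='env://')
    return args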
def main(): args = parse_args() utils.gpu_affinity.set_affinity(args.local_rank) # Initialize device and distributed backend torch.cuda.set_device(args.local_rank) l2_promote() device = torch.device('cuda' if args.cuda else 'cpu') utils.distributed.init_distributed(args.cuda) args.work_dir = utils.exp_utils.build_work_dir_name( args.work_dir, args.dataset, args.append_dataset, args.append_time, ) with utils.distributed.sync_workers() as rank: if rank == 0: create_exp_dir(args.work_dir, scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) # Setup logging if args.log_all_ranks: log_file = f'train_log_rank_{utils.distributed.get_rank()}.log' else: log_file = args.txtlog_file dllog_file = args.dllog_file log_file = os.path.join(args.work_dir, log_file) dllog_file = os.path.join(args.work_dir, dllog_file) if args.debug: log_file = os.devnull dllog_file = os.devnull utils.exp_utils.setup_logging( log_all_ranks=args.log_all_ranks, filename=log_file, ) utils.exp_utils.setup_dllogger(enabled=True, filename=dllog_file) if args.local_batch_size is not None: world_size = utils.distributed.get_world_size() args.batch_size = world_size * args.local_batch_size logging.info(f'--local_batch_size was set, adjusting global batch size' f' to {args.batch_size} (local_batch_size * world_size)') logging.info(args) dllogger.log(step='PARAMETER', data=vars(args)) logging.info(f'world size: {utils.distributed.get_world_size()}') if not args.no_env: log_env_info() register_ignoring_timeout_handler() # Set the random seed manually for reproducibility. np.random.seed(args.seed) torch.manual_seed(args.seed) ########################################################################### # Load data ########################################################################### corpus = get_lm_corpus(args.data, args.dataset, args.vocab) ntokens = len(corpus.vocab) vocab = corpus.vocab args.n_token = ntokens if args.mem_len == 0: eval_mem_len = 0 else: eval_mem_len = args.mem_len + args.tgt_len - args.eval_tgt_len tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) va_iter = corpus.get_iterator('valid', args.eval_batch_size, args.eval_tgt_len, device=device, mem_len=eval_mem_len, ext_len=args.ext_len) te_iter = corpus.get_iterator('test', args.eval_batch_size, args.eval_tgt_len, device=device, mem_len=eval_mem_len, ext_len=args.ext_len) # adaptive softmax / embedding cutoffs, tie_projs = [], [False] if args.adaptive: assert args.dataset in ['wt103', 'lm1b'] if args.dataset == 'wt103': cutoffs = [19997, 39997, 199997] tie_projs += [True] * len(cutoffs) elif args.dataset == 'lm1b': cutoffs = [59997, 99997, 639997] tie_projs += [False] * len(cutoffs) ########################################################################### # Build the model ########################################################################### model_config = { 'n_token': ntokens, 'n_layer': args.n_layer, 'n_head': args.n_head, 'd_model': args.d_model, 'd_head': args.d_head, 'd_inner': args.d_inner, 'dropout': args.dropout, 'dropatt': args.dropatt, 'dtype': None, 'tie_weight': args.tied, 'd_embed': args.d_embed, 'div_val': args.div_val, 'tie_projs': tie_projs, 'pre_lnorm': args.pre_lnorm, 'tgt_len': args.tgt_len, 'ext_len': args.ext_len, 'mem_len': args.mem_len, 'cutoffs': cutoffs, 'same_length': args.same_length, 'attn_type': args.attn_type, 'clamp_len': args.clamp_len, 'sample_softmax': args.sample_softmax, } model = MemTransformerLM(**model_config) 
model.apply(functools.partial(weights_init, args=args)) # ensure embedding init is not overridden by out_layer in case of weight sharing model.word_emb.apply(functools.partial(weights_init, args=args)) args.n_all_param = sum([p.nelement() for p in model.parameters()]) args.n_nonemb_param = sum( [p.nelement() for p in model.layers.parameters()]) # optimizer if args.optim.lower() == 'sgd': if args.sample_softmax > 0: dense_params, sparse_params = [], [] for param in model.parameters(): if param.size() == model.word_emb.weight.size(): sparse_params.append(param) else: dense_params.append(param) optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) else: optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mom) optimizer_sparse = None elif args.optim.lower() == 'adam': if args.sample_softmax > 0: dense_params, sparse_params = [], [] for param in model.parameters(): if param.size() == model.word_emb.weight.size(): sparse_params.append(param) else: dense_params.append(param) optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) optimizer = optim.Adam(dense_params, lr=args.lr, weight_decay=args.weight_decay) else: optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) optimizer_sparse = None elif args.optim.lower() == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr) optimizer_sparse = None elif args.optim.lower() == 'lamb': optimizer = lamb.Lamb(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) optimizer_sparse = None elif args.optim.lower() == 'jitlamb': optimizer = lamb.JITLamb(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) optimizer_sparse = None model = model.to(device) scaler = None if args.fp16: if args.amp == 'pytorch': scaler = torch.cuda.amp.GradScaler() elif args.amp == 'apex': model, optimizer = amp.initialize( model, optimizer, opt_level=args.apex_amp_opt_level, ) if args.multi_gpu == 'ddp' and torch.distributed.is_initialized(): para_model = DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, broadcast_buffers=False, find_unused_parameters=True, ) elif args.multi_gpu == 'dp': if args.gpu0_bsz >= 0: para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, model, dim=1).to(device) else: para_model = nn.DataParallel(model, dim=1).to(device) else: para_model = model # scheduler if args.scheduler == 'cosine': if args.max_step_scheduler: max_step = args.max_step_scheduler else: max_step = args.max_step scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, max_step - args.warmup_step, eta_min=args.eta_min) if args.sample_softmax > 0 and optimizer_sparse is not None: scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR( optimizer_sparse, max_step - args.warmup_step, eta_min=args.eta_min) else: scheduler_sparse = None elif args.scheduler == 'inv_sqrt': # originally used for Transformer (in Attention is all you need) def lr_lambda(step): # return a multiplier instead of a learning rate if step == 0 and args.warmup_step == 0: return 1. else: return 1. 
/ (step ** 0.5) if step > args.warmup_step \ else step / (args.warmup_step ** 1.5) scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) if args.sample_softmax > 0 and optimizer_sparse is not None: scheduler_sparse = optim.lr_scheduler.LambdaLR(optimizer_sparse, lr_lambda=lr_lambda) else: scheduler_sparse = None elif args.scheduler == 'dev_perf': scheduler = optim.lr_scheduler.ReduceLROnPlateau( optimizer, factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min, ) if args.sample_softmax > 0 and optimizer_sparse is not None: scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau( optimizer_sparse, factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min, ) else: scheduler_sparse = None elif args.scheduler == 'constant': pass logging.info('=' * 100) for k, v in args.__dict__.items(): logging.info(' - {} : {}'.format(k, v)) logging.info('=' * 100) logging.info('#params = {}'.format(args.n_all_param)) logging.info('#non emb params = {}'.format(args.n_nonemb_param)) train_step = 0 start_epoch = 1 last_batch = 0 last_iter = 0 best_val_loss = None if args.restart: try: checkpoint = load_checkpoint(args.restart) model.load_state_dict(checkpoint['model_state']) optimizer.load_state_dict(checkpoint['optimizer_state']) scheduler.load_state_dict(checkpoint['scheduler_state']) if args.fp16: if args.amp == 'pytorch': scaler.load_state_dict(checkpoint['amp_state']) elif args.amp == 'apex': amp.load_state_dict(checkpoint['amp_state']) train_step = checkpoint['train_step'] start_epoch = checkpoint['epoch'] last_batch = checkpoint['batch'] last_iter = checkpoint['last_iter'] best_val_loss = checkpoint['best_val_loss'] if train_step >= args.max_step: logging.info( f'Loaded checkpoint after {train_step} steps, but ' f'this run was scheduled for a total of ' f'{args.max_step} steps, exiting') sys.exit(1) model.apply(functools.partial(update_dropout, args=args)) model.apply(functools.partial(update_dropatt, args=args)) except FileNotFoundError: logging.info(f'Could not load checkpoint from {args.restart}, ' f'starting training from random init') meters = {} warmup = args.mem_len // args.tgt_len + 2 meters['train_throughput'] = AverageMeter(warmup=warmup) ########################################################################### # Train ########################################################################### # Loop over epochs. # At any point you can hit Ctrl + C to break out of training early. start_time = time.time() with TimeoutHandler() as timeout_handler: try: for epoch in itertools.count(start=start_epoch): if args.roll: tr_iter.roll(seed=args.seed + epoch) train_step, best_val_loss = train( tr_iter, va_iter, model, para_model, model_config, optimizer, optimizer_sparse, scheduler, scheduler_sparse, scaler, vocab, epoch, last_batch, last_iter, train_step, best_val_loss, meters, timeout_handler, device, args) last_batch = 0 last_iter = 0 if train_step == args.max_step: logging.info('-' * 100) logging.info('End of training') break except KeyboardInterrupt: logging.info('-' * 100) logging.info('Exiting from training early') elapsed = time.time() - start_time ########################################################################### # Test ########################################################################### summary = {} test_path = os.path.join(args.work_dir, 'checkpoint_best.pt') if not args.debug and not args.no_eval and os.path.exists(test_path): # Load the best saved model. 
checkpoint = load_checkpoint(test_path) model.load_state_dict(checkpoint['model_state']) # Run on test data. test_start_time = time.time() test_loss = evaluate(te_iter, model, args) test_loss = utils.distributed.all_reduce_item(test_loss, 'mean') test_elapsed = time.time() - test_start_time logging.info('=' * 100) if args.dataset in ['enwik8', 'text8']: logging.info( '| End of training | test time: {:5.2f}s | test loss {:5.2f} | test bpc {:9.5f}' .format(test_elapsed, test_loss, test_loss / math.log(2))) else: logging.info( '| End of training | test time: {:5.2f}s | test loss {:5.2f} | test ppl {:9.3f}' .format(test_elapsed, test_loss, math.exp(test_loss))) logging.info('=' * 100) summary.update({ 'test_elapsed': test_elapsed, 'test_loss': test_loss, }) if args.dataset in ['enwik8', 'text8']: summary['test_bits_per_character'] = test_loss / math.log(2) else: summary['test_perplexity'] = math.exp(test_loss) logging.info(f'Training time: {(elapsed / 60):.2f} minutes') logging.info( f'Training throughput: {meters["train_throughput"].avg:.2f} tok/s') if best_val_loss: val_perplexity = math.exp(best_val_loss) else: val_perplexity = None summary.update({ 'train_throughput': meters['train_throughput'].avg, 'train_elapsed': elapsed / 60, 'valid_loss': best_val_loss, 'valid_perplexity': val_perplexity, }) dllogger.log(step=tuple(), data=summary) passed = benchmark(target_perplexity=args.target_perplexity, test_perplexity=val_perplexity, target_throughput=args.target_throughput, test_throughput=meters['train_throughput'].avg) if not passed: sys.exit(1)
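# utils.distributed.all_reduce_item above is project-specific. A plausible sketch (an
# assumption, not the project's implementation) of averaging a Python scalar across ranks
# with torch.distributed, falling back to the local value in single-process runs:
import torch
import torch.distributed as dist


def all_reduce_item_sketch(value, op='mean'):
    if not (dist.is_available() and dist.is_initialized()):
        return value
    # NCCL reduces GPU tensors; place the scalar on the current device
    t = torch.tensor(float(value), device=torch.cuda.current_device())
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    if op == 'mean':
        t /= dist.get_world_size()
    return t.item()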
def train_ai2thor(model, args, rank=0, b=None): seed = args.seed + 10000 * rank torch.manual_seed(seed) np.random.seed(seed) # torch.cuda.set_device(rank) device = torch.device(f'cuda:{rank}') os.environ['DISPLAY'] = f':{rank}' model = model.to(device) model.share_memory() # Experience buffer storage = PPOBuffer(model.obs_shape, args.steps, args.num_workers, args.state_size, args.gamma, device=device) storage.share_memory() #torch.multiprocessing.set_start_method('spawn') # start multiple processes ready_to_works = [Event() for _ in range(args.num_workers)] exit_flag = Value('i', 0) queue = SimpleQueue() processes = [] task_config_file = "config_files/multiMugTaskTrain.json" # start workers for worker_id in range(args.num_workers): p = Process(target=worker, args=(worker_id, model, storage, ready_to_works[worker_id], queue, exit_flag, task_config_file)) p.start() processes.append(p) # start trainer train_params = { "epochs": args.epochs, "steps": args.steps, "world_size": args.world_size, "num_workers": args.num_workers } ppo_params = { "clip_param": args.clip_param, "train_iters": args.train_iters, "mini_batch_size": args.mini_batch_size, "value_loss_coef": args.value_loss_coef, "entropy_coef": args.entropy_coef, "rnn_steps": args.rnn_steps, "lr": args.lr, "max_kl": args.max_kl } distributed = False if args.world_size > 1: distributed = True # Initialize Process Group, distributed backend type dist_backend = 'nccl' # Url used to setup distributed training dist_url = "tcp://127.0.0.1:23456" print("Initialize Process Group... pid:", os.getpid()) dist.init_process_group(backend=dist_backend, init_method=dist_url, rank=rank, world_size=args.world_size) # Make model DistributedDataParallel model = DistributedDataParallel(model, device_ids=[rank], output_device=rank) learner(model, storage, train_params, ppo_params, ready_to_works, queue, exit_flag, rank, distributed, b) for p in processes: print("process ", p.pid, " joined") p.join()
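# train_ai2thor() is written as a per-rank entry point: each copy calls
# dist.init_process_group(..., rank=rank) itself. A minimal sketch of fanning it out with
# torch.multiprocessing, one process per GPU (build_model is a hypothetical factory; the
# original constructs the model elsewhere):
import torch.multiprocessing as mp


def _rank_entry(rank, args):
    model = build_model(args)  # hypothetical: create a fresh model inside each child process
    train_ai2thor(model, args, rank=rank)


def launch_training_sketch(args):
    if args.world_size > 1:
        mp.spawn(_rank_entry, args=(args,), nprocs=args.world_size, join=True)
    else:
        _rank_entry(0, args)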
def train(procid, args): # load and preprocess dataset assert procid >= 0 os.environ['MASTER_ADDR'] = args.MASTER_ADDR os.environ['MASTER_PORT'] = args.MASTER_PORT if args.dataset == 'cora': data = CoraGraphDataset() elif args.dataset == 'citeseer': data = CiteseerGraphDataset() elif args.dataset == 'pubmed': data = PubmedGraphDataset() elif args.dataset == 'reddit': data = RedditDataset() else: raise ValueError('Unknown dataset: {}'.format(args.dataset)) g = data[0] #data = args.data #g = args.data[0] #g.create_formats_() print("New Proc! ", procid) #return g device = torch.device(args.devices_name_list[procid]) torch.cuda.set_device(device) dist_init_method = 'tcp://{master_ip}:{master_port}'.format( master_ip=args.MASTER_ADDR, master_port=args.MASTER_PORT) world_size = args.ngpus torch.distributed.init_process_group(backend="nccl", init_method=dist_init_method, world_size=world_size, rank=procid) #torch.cuda.set_device(device) #st = pg.Storage(g,[device],[args.PV_list[procid]],[args.TV_list[procid]]) # use pagraph st = pg.Storage(g=g, data=g.ndata, cache_rate=args.cache_rate, nodes=args.PV_list[procid], gpu=args.devices_name_list[procid], cpu='cpu') if (True): features = g.ndata.pop('feat') labels = g.ndata.pop('label') train_mask = g.ndata.pop('train_mask') val_mask = g.ndata.pop('val_mask') test_mask = g.ndata.pop('test_mask') in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() print("""----Data statistics------' #Edges %d #Classes %d #Train samples %d #Val samples %d #Test samples %d""" % (n_edges, n_classes, train_mask.int().sum().item(), val_mask.int().sum().item(), test_mask.int().sum().item())) del features #release memory # add self loop ''' if args.self_loop: g = dgl.remove_self_loop(g) g = dgl.add_self_loop(g) ''' # create GCN model model = MyGCN( in_feats, args.n_hidden, n_classes, args.n_layers, F.relu, args.dropout, ) model = model.to(device) model = DistributedDataParallel( model, device_ids=[device], output_device=device) #device_ids = [device], output_device = device # set sampler fanouts = [] for i in range(args.n_layers): fanouts.append(args.neighbor_number) ''' example: fanout=[2,2,2,2] or [3,3,3] ... 
''' sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts) train_nids = args.TV_list[procid] dataloader = dgl.dataloading.NodeDataLoader(g, train_nids, sampler, batch_size=args.batch_size, shuffle=False, drop_last=True, num_workers=0) # set loss function loss_fcn = torch.nn.CrossEntropyLoss() # use optimizer optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # initialize graph dur = [] # Sync #if(args.ngpus > 1): # torch.distributed.barrier() #Start trainning model.train() for epoch in range(args.n_epochs): # time record #if epoch >= 3: tS = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] t0 = time.time() # forward #Loss=torch.tensor([0.0],device=device,required_grad=False) for count, (in_nodes, out_nodes, blocks) in enumerate(dataloader): t1 = time.time() blocks = [b.to(device) for b in blocks] t2 = time.time() feat_in = st.Query(fname='feat', nodes=in_nodes) labels_out = st.Query(fname='label', nodes=out_nodes) t3 = time.time() # forward feat_out = model(blocks, feat_in) t4 = time.time() loss = loss_fcn(feat_out, labels_out) #Loss=Loss+loss.detach() t5 = time.time() optimizer.zero_grad() loss.backward() optimizer.step() t6 = time.time() tS[1] = tS[1] + t2 - t1 tS[2] = tS[2] + t3 - t2 tS[3] = tS[3] + t4 - t3 tS[4] = tS[4] + t5 - t4 tS[5] = tS[5] + t6 - t5 tE = time.time() #logits = model(features) #loss = loss_fcn(logits[train_mask], labels[train_mask]) #optimizer.zero_grad() #loss.backward() #optimizer.step() #if epoch >= 3: dur.append(time.time() - t0) acc = 0.0 #evaluate(model, features, labels, val_mask) if (procid >= 0): print( "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(), acc, n_edges / np.mean(dur) / 1000)) #for i in range(1,6): print(tS[1:], '\nTotal:', tE - t0, " s ") #Finish trainning # Sync #if(args.ngpus > 1000): # torch.distributed.barrier() model.eval() time.sleep(3) print("____________________________")
def DistributedFairseqModel(args, model, process_group, device): """ Wrap a *model* to support distributed data parallel training. This is similar to the built-in DistributedDataParallel, but allows additional configuration of the DistributedDataParallel class to use, and also provides easier access to the wrapped model by forwarding requests for missing attributes to the wrapped model. Args: args (argparse.Namespace): fairseq args model (BaseFairseqModel): model to wrap process_group: the c10d process group to be used for distributed data parallel all-reduction. device: device to move model to """ assert isinstance(model, nn.Module) if args.tpu: wrapped_model = TPUDistributedDataParallel( module=model.to(device), process_group=process_group, ) # forward missing getattr and state_dict/load_state_dict to orig model wrapped_model = ModuleProxyWrapper(wrapped_model) elif args.ddp_backend in {"c10d", "pytorch_ddp"}: wrapped_model = DistributedDataParallel( module=model.to(device), device_ids=[args.device_id], output_device=args.device_id, broadcast_buffers=args.broadcast_buffers, bucket_cap_mb=args.bucket_cap_mb, process_group=process_group, find_unused_parameters=args.find_unused_parameters, ) # forward missing getattr and state_dict/load_state_dict to orig model wrapped_model = ModuleProxyWrapper(wrapped_model) elif args.ddp_backend in {"no_c10d", "legacy_ddp"}: wrapped_model = LegacyDistributedDataParallel( module=model.to(device), buffer_size=2**28, process_group=process_group, ) # forward missing getattr and state_dict/load_state_dict to orig model wrapped_model = ModuleProxyWrapper(wrapped_model) elif args.ddp_backend == "slow_mo": if _GOSSIP_DISABLED: raise ImportError( "Cannot find gossip library. Please install from: " "github.com/facebookresearch/stochastic_gradient_push") # The values of slowmo_momentum below were obtained by tuning on the # En-De 16 dataset by training the transformer_wmt_en_de_large model if args.slowmo_momentum is None: if args.distributed_world_size <= 16: args.slowmo_momentum = 0.0 elif args.distributed_world_size <= 32: args.slowmo_momentum = 0.2 elif args.distributed_world_size <= 64: args.slowmo_momentum = 0.5 else: args.slowmo_momentum = 0.6 wrapped_model = gossip.GossipDataParallel( module=model.to(device), device_ids=[args.device_id], output_device=args.device_id, broadcast_buffers=args.broadcast_buffers, nprocs_per_node=args.nprocs_per_node, slowmo_momentum=args.slowmo_momentum, localsgd=(args.slowmo_algorithm == "LocalSGD"), localsgd_frequency=args.localsgd_frequency, ) # forward missing getattr and state_dict/load_state_dict to orig model wrapped_model = ModuleProxyWrapper(wrapped_model) elif args.ddp_backend == "fully_sharded": try: from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP except ImportError: raise ImportError( "Cannot find FullyShardedDataParallel. " "Please install fairscale with: pip install fairscale") assert isinstance(model, FSDP), "expected model to already be wrapped in FSDP" wrapped_model = model if args.memory_efficient_fp16: wrapped_model = wrapped_model.half() if not args.cpu_offload: wrapped_model = wrapped_model.to(device=device) else: raise ValueError("Unknown --ddp-backend: " + args.ddp_backend) # kill hung distributed jobs after a timeout if getattr(args, "heartbeat_timeout", -1) > 0: wrapped_model = DistributedTimeoutWrapper(wrapped_model, timeout=getattr( args, "heartbeat_timeout", -1)) return wrapped_model
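# A minimal usage sketch for DistributedFairseqModel with the plain pytorch_ddp backend.
# The Namespace below only carries the attributes the function actually reads; the concrete
# values are illustrative, not fairseq defaults.
import argparse
import torch
import torch.distributed as dist


def wrap_for_ddp_sketch(model):
    args = argparse.Namespace(
        tpu=False,
        ddp_backend="pytorch_ddp",
        device_id=torch.cuda.current_device(),
        broadcast_buffers=False,
        bucket_cap_mb=25,
        find_unused_parameters=False,
        heartbeat_timeout=-1,
    )
    device = torch.device("cuda", args.device_id)
    # reuse the default (global) process group for the all-reduce
    return DistributedFairseqModel(args, model, dist.group.WORLD, device)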
def worker(proc_id, gpu_ranks, args, model): """ Args: proc_id: The id of GPU for single GPU mode; The id of process (and GPU) for multiprocessing distributed mode. gpu_ranks: List of ranks of each process. """ set_seed(args.seed) if args.dist_train: rank = gpu_ranks[proc_id] gpu_id = proc_id elif args.single_gpu: rank = None gpu_id = proc_id else: rank = None gpu_id = None if args.dist_train: train_loader = globals()[args.target.capitalize() + "DataLoader"]( args, args.dataset_path, args.batch_size, rank, args.world_size, True) else: train_loader = globals()[args.target.capitalize() + "DataLoader"]( args, args.dataset_path, args.batch_size, 0, 1, True) if gpu_id is not None: torch.cuda.set_device(gpu_id) model.cuda(gpu_id) # Build optimizer. param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup, t_total=train_steps) if args.dist_train: # Initialize multiprocessing distributed training environment. dist.init_process_group(backend=args.backend, init_method=args.master_ip, world_size=args.world_size, rank=rank) model = DistributedDataParallel(model, device_ids=[gpu_id]) print("Worker %d is training ... " % rank) else: print("Worker is training ...") globals().get("train_" + args.target)(args, gpu_id, rank, train_loader, model, optimizer, scheduler)
def register(self, *, models, optimizers, criterion=None, schedulers=None): """Registers parameters with Ray SGD and sets up training components. By calling this method to register your models, optimizers, criterion, and schedulers, Ray SGD will automatically handle necessary setup such as GPU/devices, Distributed Data Parallel, and Fp16. The registered components are returned and should be set as instance attributes to access during training/validation. If more than one model, optimizer, or scheduler is passed in, you should implement your own custom training loop. .. code-block:: python class MyTrainingOperator(TrainingOperator): def setup(self, config): model = ... optimizer = ... train_loader = ... val_loader = ... loss = ... self.model, self.optimizer, self.criterion = self.register( models=model, optimizers=optimizer, criterion=loss) # At this point DDP, Cuda, and Fp16 # are set up for all our components. We then use # self.model, self.optimizer, etc. in our training loop. self.register_data(train_loader=train_loader, validation_loader=val_loader) Args: models (torch.nn.Module or Iterable[nn.Module]): Pytorch model or multiple Pytorch models to use for training. If `use_gpu=True` is passed into ``TorchTrainer``, and Cuda is available, models will automatically be placed on GPU. If ``wrap_ddp=True`` is passed into ``TorchTrainer``, models will be wrapped in DDP. If wrap_ddp is False, you should handle DDP for your models in setup. optimizers (torch.optim.Optimizer or Iterable[ torch.optim.Optimizer]): Pytorch optimizer or multiple Pytorch optimizers to use for training. criterion (Callable, optional): Function to return loss metric given features and target. If not provided, must implement a custom training loop. schedulers (torch.optim.lr_scheduler or Iterable[ torch.optim.lr_scheduler], optional): A learning rate scheduler or multiple learning rate schedulers. Returns: Tuple of model, optimizer, criterion if not None, and scheduler if not None. 
""" return_vals = [] logger.debug("Registering models.") self._original_models = models if not isinstance(self._original_models, Iterable): self._original_models = [self._original_models] assert all( isinstance(model, nn.Module) for model in self._original_models), ( f"All models must be PyTorch models: {self._original_models}.") if self.use_gpu and torch.cuda.is_available(): self._original_models = [ model.cuda() for model in self._original_models ] logger.debug("Registering optimizers.") self._optimizers = optimizers if not isinstance(self._optimizers, Iterable): self._optimizers = [self._optimizers] if schedulers: logger.debug("Registering scheduler.") self._schedulers = schedulers if not isinstance(self._schedulers, Iterable): self._schedulers = [self._schedulers] else: self._schedulers = None if criterion: logger.debug("Registering loss.") self._criterion = criterion if self.use_gpu and torch.cuda.is_available(): if hasattr(self._criterion, "cuda"): self._criterion = self._criterion.cuda() else: self._criterion = None if self.use_fp16 and amp: logger.debug("Setting up Apex.") self._models, self._optimizers = amp.initialize( self._models, self._optimizers, **self._apex_args) self._amp = amp if self._wrap_ddp: logging.debug("Setting up DDP for models.") self._models = [ DistributedDataParallel(model, device_ids=self.device_ids) for model in self._original_models ] else: self._models = self._original_models if len(self._models) == 1: return_vals.append(self._models[0]) else: return_vals.append(self._models) if len(self._optimizers) == 1: return_vals.append(self._optimizers[0]) else: return_vals.append(self._optimizers) if self._criterion is not None: return_vals.append(self._criterion) if self._schedulers is not None: if self.scheduler_step_freq is None: raise ValueError("scheduler_step_freq passed into " "TorchTrainer cannot be None if you " "are registering schedulers. Set this to " "'manual' if you will be manually stepping " "the schedulers.") if len(self._schedulers) == 1: return_vals.append(self._schedulers[0]) else: return_vals.append(self._schedulers) return tuple(return_vals)
def make_model_env(self, gpu, ngpus_per_node): if self.args.distributed: self.args.gpu = self.args.devices[gpu] else: self.args.gpu = 0 if self.args.use_cuda and self.args.distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes self.args.rank = self.args.rank * ngpus_per_node + gpu dist.init_process_group(backend=self.args.dist_backend, init_method=self.args.dist_url, world_size=self.args.world_size, rank=self.args.rank) self.model = DomainQA(self.args.bert_model, self.args.num_classes, self.args.hidden_size, self.args.num_layers, self.args.dropout, self.args.dis_lambda, self.args.concat, self.args.anneal) if self.args.load_model is not None: print("Loading model from ", self.args.load_model) self.model.load_state_dict( torch.load(self.args.load_model, map_location=lambda storage, loc: storage)) if self.args.freeze_bert: for param in self.model.bert.parameters(): param.requires_grad = False max_len = max([len(f) for f in self.features_lst]) num_train_optimization_steps = math.ceil( max_len / self.args.batch_size) * self.args.epochs * len( self.features_lst) qa_params = list(self.model.bert.named_parameters()) + list( self.model.qa_outputs.named_parameters()) dis_params = list(self.model.discriminator.named_parameters()) self.qa_optimizer = get_opt(qa_params, num_train_optimization_steps, self.args) self.dis_optimizer = get_opt(dis_params, num_train_optimization_steps, self.args) if self.args.use_cuda: if self.args.distributed: torch.cuda.set_device(self.args.gpu) self.model.cuda(self.args.gpu) self.args.batch_size = int(self.args.batch_size / ngpus_per_node) self.args.workers = int( (self.args.workers + ngpus_per_node - 1) / ngpus_per_node) self.model = DistributedDataParallel( self.model, device_ids=[self.args.gpu], find_unused_parameters=True) else: self.model.cuda() self.model = DataParallel(self.model, device_ids=self.args.devices) cudnn.benchmark = True
def __init__(self, opt, edge_enhance=True): super(SRGANModel, self).__init__(opt) self.edge_enhance = edge_enhance if opt['dist']: self.rank = torch.distributed.get_rank() else: self.rank = -1 # non dist training train_opt = opt['train'] self.netG = networks.define_G(opt).to(self.device) if opt['dist']: self.netG = DistributedDataParallel( self.netG, device_ids=[torch.cuda.current_device()]) else: self.netG = DataParallel(self.netG) if self.is_train: self.netD = networks.define_D(opt).to(self.device) if opt['dist']: self.netD = DistributedDataParallel( self.netD, device_ids=[torch.cuda.current_device()]) else: self.netD = DataParallel(self.netD) self.netG.train() self.netD.train() if self.is_train: if train_opt['pixel_weight'] > 0: l_pix_type = train_opt['pixel_criterion'] if l_pix_type == 'l1': self.cri_pix = nn.L1Loss().to(self.device) elif l_pix_type == 'l2': self.cri_pix = nn.MSELoss().to(self.device) else: raise NotImplementedError( 'Loss type [{:s}] not recognized.'.format(l_pix_type)) self.l_pix_w = train_opt['pixel_weight'] else: logger.info('Remove pixel loss.') self.cri_pix = None if train_opt['feature_weight'] > 0: l_fea_type = train_opt['feature_criterion'] if l_fea_type == 'l1': self.cri_fea = nn.L1Loss().to(self.device) elif l_fea_type == 'l2': self.cri_fea = nn.MSELoss().to(self.device) else: raise NotImplementedError( 'Loss type [{:s}] not recognized.'.format(l_fea_type)) self.l_fea_w = train_opt['feature_weight'] else: logger.info('Remove feature loss.') self.cri_fea = None if self.cri_fea: # load VGG perceptual loss self.netF = networks.define_F(opt, use_bn=False).to(self.device) if opt['dist']: self.netF = DistributedDataParallel( self.netF, device_ids=[torch.cuda.current_device()]) else: self.netF = DataParallel(self.netF) self.cri_gan = GANLoss(train_opt['gan_type'], 1.0, 0.0).to(self.device) self.l_gan_w = train_opt['gan_weight'] self.D_update_ratio = train_opt['D_update_ratio'] if train_opt[ 'D_update_ratio'] else 1 self.D_init_iters = train_opt['D_init_iters'] if train_opt[ 'D_init_iters'] else 0 self.WGAN_QC_regul = QC_GradientPenaltyLoss() if self.edge_enhance: self.l_edge_w = train_opt['edge_weight'] if train_opt['edge_type'] == 'sobel': self.cril_edge = sobel elif train_opt['edge_type'] == 'canny': self.cril_edge = canny elif train_opt['edge_type'] == 'hednet': self.netEdge = HedNet().cuda() for p in self.netEdge.parameters(): p.requires_grad = False self.cril_edge = self.netEdge else: raise NotImplementedError( 'Loss type [{:s}] not recognized.'.format( train_opt['edge_type'])) else: logger.info('Remove edge loss.') self.cril_edge = None wd_G = train_opt['weight_decay_G'] if train_opt[ 'weight_decay_G'] else 0 optim_params = [] for k, v in self.netG.named_parameters( ): # can optimize for a part of the model if v.requires_grad: optim_params.append(v) else: if self.rank <= 0: logger.warning( 'Params [{:s}] will not optimize.'.format(k)) self.optimizer_G = torch.optim.Adam(optim_params, lr=train_opt['lr_G'], weight_decay=wd_G, betas=(train_opt['beta1_G'], train_opt['beta2_G'])) self.optimizers.append(self.optimizer_G) wd_D = train_opt['weight_decay_D'] if train_opt[ 'weight_decay_D'] else 0 self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=train_opt['lr_D'], weight_decay=wd_D, betas=(train_opt['beta1_D'], train_opt['beta2_D'])) self.optimizers.append(self.optimizer_D) if train_opt['lr_scheme'] == 'MultiStepLR': for optimizer in self.optimizers: self.schedulers.append( lr_scheduler.MultiStepLR_Restart( optimizer, train_opt['lr_steps'], 
restarts=train_opt['restarts'], weights=train_opt['restart_weights'], gamma=train_opt['lr_gamma'], clear_state=train_opt['clear_state'])) elif train_opt['lr_scheme'] == 'CosineAnnealingLR_Restart': for optimizer in self.optimizers: self.schedulers.append( lr_scheduler.CosineAnnealingLR_Restart( optimizer, train_opt['T_period'], eta_min=train_opt['eta_min'], restarts=train_opt['restarts'], weights=train_opt['restart_weights'])) else: raise NotImplementedError( 'MultiStepLR learning rate scheme is enough.') self.log_dict = OrderedDict() self.load()
def __init__(self, opt): super(SRGANModel, self).__init__(opt) if opt['dist']: self.rank = torch.distributed.get_rank() else: self.rank = -1 # non dist training train_opt = opt['train'] # define networks and load pretrained models self.netG = networks.define_G(opt).to(self.device) if opt['dist']: self.netG = DistributedDataParallel( self.netG, device_ids=[torch.cuda.current_device()]) else: self.netG = DataParallel(self.netG) if self.is_train: self.netD = networks.define_D(opt).to(self.device) if opt['dist']: self.netD = DistributedDataParallel( self.netD, device_ids=[torch.cuda.current_device()]) else: self.netD = DataParallel(self.netD) self.netG.train() self.netD.train() # define losses, optimizer and scheduler if self.is_train: # G pixel loss if train_opt['pixel_weight'] > 0: l_pix_type = train_opt['pixel_criterion'] if l_pix_type == 'l1': self.cri_pix = nn.L1Loss().to(self.device) elif l_pix_type == 'l2': self.cri_pix = nn.MSELoss().to(self.device) else: raise NotImplementedError( 'Loss type [{:s}] not recognized.'.format(l_pix_type)) self.l_pix_w = train_opt['pixel_weight'] else: logger.info('Remove pixel loss.') self.cri_pix = None # G feature loss if train_opt['feature_weight'] > 0: l_fea_type = train_opt['feature_criterion'] if l_fea_type == 'l1': self.cri_fea = nn.L1Loss().to(self.device) elif l_fea_type == 'l2': self.cri_fea = nn.MSELoss().to(self.device) else: raise NotImplementedError( 'Loss type [{:s}] not recognized.'.format(l_fea_type)) self.l_fea_w = train_opt['feature_weight'] else: logger.info('Remove feature loss.') self.cri_fea = None if self.cri_fea: # load VGG perceptual loss self.netF = networks.define_F(opt, use_bn=False).to(self.device) if opt['dist']: pass # do not need to use DistributedDataParallel for netF else: self.netF = DataParallel(self.netF) # GD gan loss self.cri_gan = GANLoss(train_opt['gan_type'], 1.0, 0.0).to(self.device) self.l_gan_w = train_opt['gan_weight'] # D_update_ratio and D_init_iters self.D_update_ratio = train_opt['D_update_ratio'] if train_opt[ 'D_update_ratio'] else 1 self.D_init_iters = train_opt['D_init_iters'] if train_opt[ 'D_init_iters'] else 0 # optimizers # G wd_G = train_opt['weight_decay_G'] if train_opt[ 'weight_decay_G'] else 0 optim_params = [] for k, v in self.netG.named_parameters( ): # can optimize for a part of the model if v.requires_grad: optim_params.append(v) else: if self.rank <= 0: logger.warning( 'Params [{:s}] will not optimize.'.format(k)) self.optimizer_G = torch.optim.Adam(optim_params, lr=train_opt['lr_G'], weight_decay=wd_G, betas=(train_opt['beta1_G'], train_opt['beta2_G'])) self.optimizers.append(self.optimizer_G) # D wd_D = train_opt['weight_decay_D'] if train_opt[ 'weight_decay_D'] else 0 self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=train_opt['lr_D'], weight_decay=wd_D, betas=(train_opt['beta1_D'], train_opt['beta2_D'])) self.optimizers.append(self.optimizer_D) # schedulers if train_opt['lr_scheme'] == 'MultiStepLR': for optimizer in self.optimizers: self.schedulers.append( lr_scheduler.MultiStepLR_Restart( optimizer, train_opt['lr_steps'], restarts=train_opt['restarts'], weights=train_opt['restart_weights'], gamma=train_opt['lr_gamma'], clear_state=train_opt['clear_state'])) elif train_opt['lr_scheme'] == 'CosineAnnealingLR_Restart': for optimizer in self.optimizers: self.schedulers.append( lr_scheduler.CosineAnnealingLR_Restart( optimizer, train_opt['T_period'], eta_min=train_opt['eta_min'], restarts=train_opt['restarts'], weights=train_opt['restart_weights'])) else: raise 
NotImplementedError( 'MultiStepLR learning rate scheme is enough.') self.log_dict = OrderedDict() self.print_network() # print network self.load() # load G and D if needed if self.opt['use_wandb_logger'] and 'debug' not in self.opt['name']: wandb.watch(self.netG) wandb.watch(self.netD)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--train', type=Path, required=True) parser.add_argument('--dev', type=Path, required=True) parser.add_argument('--model-name', type=str, required=True) parser.add_argument('--ckpt', type=str, default='ckpt') parser.add_argument('--batch_size', type=int, default=8) parser.add_argument('--epochs', type=int, default=3) parser.add_argument('--lr', type=float, default=1e-4) parser.add_argument('-a', '--accumulation_steps', type=int, default=1) parser.add_argument('--neg_sample', type=bool, default=False) parser.add_argument('--fp16', action='store_true') # Automatically supplied by torch.distributed.launch parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() logging.basicConfig(level=logging.INFO) # Based on code from SciFact : class FeverLabelPredictionDataset(Dataset): def __init__(self, file): claims, rationales, labels = self._read(file) self._claims = claims self._rationales = rationales self._labels = labels def _read(self, file): claims = [] rationales = [] labels = [] #labels = {'SUPPORTS': 2, 'NOT ENOUGH INFO': 1, 'REFUTES': 0} # From SciFact label_idx = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT ENOUGH INFO': 2} # To Match COVIDLies for data in jsonlines.open(file): if data['label'] == 'NOT ENOUGH INFO': if data['sentences']: indices = sorted(random.sample(range(len(data['sentences'])), k=1)) sentences = [data['sentences'][i] for i in indices] claims.append(data['claim']) rationales.append(' '.join(sentences)) labels.append(label_idx['NOT ENOUGH INFO']) else: for evidence_set in data['evidence_sets']: claims.append(data['claim']) rationales.append(' '.join([data['sentences'][i] for i in evidence_set])) labels.append(label_idx[data['label']]) if args.neg_sample: # Add negative samples non_evidence_indices = set(range(len(data['sentences']))) - set( s for es in data['evidence_sets'] for s in es) if non_evidence_indices: non_evidence_indices = random.sample(non_evidence_indices, k=random.randint(1, min(1, len(non_evidence_indices)))) sentences = [data['sentences'][i] for i in non_evidence_indices] claims.append(data['claim']) rationales.append(' '.join(sentences)) labels.append(label_idx['NOT ENOUGH INFO']) return claims, rationales, labels def __len__(self): return len(self._labels) def __getitem__(self, index): claim = self._claims[index] rationale = self._rationales[index] label = self._labels[index] return claim, rationale, label # Additional janky distributed stuff args.distributed = False world_size = int(os.environ.get('WORLD_SIZE', 1)) args.distributed = world_size > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info('Loading training data') train_dataset = FeverLabelPredictionDataset(args.train) train_dataloader = DataLoader( train_dataset, batch_size=args.batch_size, sampler=get_sampler(train_dataset, world_size, args.local_rank) ) logger.info('Loading dev data') dev_dataset = FeverLabelPredictionDataset(args.dev) dev_dataloader = DataLoader( dev_dataset, batch_size=args.batch_size, sampler=get_sampler(dev_dataset, world_size, args.local_rank), shuffle=False # Seems weird but the HuggingFace guys do it so... 
) model = SentenceBertClassifier(model_name=args.model_name, num_classes=3).cuda() optimizer = transformers.AdamW(model.parameters(), lr=args.lr) if args.fp16: model, optimizer = amp.initialize(model, optimizer, opt_level='O1') if args.distributed: model = DistributedDataParallel(model) loss_fn = torch.nn.CrossEntropyLoss() # Do we need to ignore padding? for epoch in range(args.epochs): logger.info(f'Epoch: {epoch}') logger.info('Training...') model.train() if args.local_rank == 0: iterable = tqdm(train_dataloader) else: iterable = train_dataloader for i, (claims, rationales, labels) in enumerate(iterable): if not i % args.accumulation_steps: optimizer.step() optimizer.zero_grad() logits = model(claims, rationales) _, preds = logits.max(dim=-1) labels = torch.tensor(labels).cuda() acc = (preds == labels).float().mean() loss = loss_fn(logits, labels) if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if args.local_rank == 0: iterable.set_description(f'Loss: {loss : 0.4f} - Acc: {acc : 0.4f}') logger.info('Evaluating...') model.eval() correct = 0. total = 0. if args.local_rank == 0: iterable = tqdm(dev_dataloader) else: iterable = dev_dataloader for claims, rationales, labels in iterable: with torch.no_grad(): logits = model(claims, rationales) _, preds = logits.max(dim=-1) labels = torch.tensor(labels).cuda() correct += (preds == labels).float().sum() total += labels.size(0) if args.local_rank == 0: acc = correct / total iterable.set_description(f'Accuracy: {acc.item() : 0.4f}') logger.info('Saving...') if args.local_rank == 0: torch.save(model.state_dict(), f'{args.ckpt}-{epoch}.pt')
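# get_sampler() is used above but not defined in this snippet. A plausible sketch
# (assumption): shard the dataset with DistributedSampler when more than one process is
# running, otherwise fall back to a plain random sampler.
from torch.utils.data import RandomSampler
from torch.utils.data.distributed import DistributedSampler


def get_sampler_sketch(dataset, world_size, rank):
    if world_size > 1:
        return DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    return RandomSampler(dataset)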
def __init__( self, model: Model, optimizer: torch.optim.Optimizer, iterator: DataIterator, train_dataset: Iterable[Instance], validation_dataset: Optional[Iterable[Instance]] = None, patience: Optional[int] = None, validation_metric: str = "-loss", validation_iterator: DataIterator = None, shuffle: bool = True, num_epochs: int = 20, serialization_dir: Optional[str] = None, num_serialized_models_to_keep: int = 20, keep_serialized_model_every_num_seconds: int = None, checkpointer: Checkpointer = None, model_save_interval: float = None, cuda_device: int = -1, grad_norm: Optional[float] = None, grad_clipping: Optional[float] = None, learning_rate_scheduler: Optional[LearningRateScheduler] = None, momentum_scheduler: Optional[MomentumScheduler] = None, summary_interval: int = 100, histogram_interval: int = None, should_log_parameter_statistics: bool = True, should_log_learning_rate: bool = False, log_batch_size_period: Optional[int] = None, moving_average: Optional[MovingAverage] = None, distributed: bool = False, rank: int = 0, world_size: int = 1, num_gradient_accumulation_steps: int = 1, ) -> None: """ A trainer for doing supervised learning. It just takes a labeled dataset and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights for your model over some fixed number of epochs. You can also pass in a validation dataset and enable early stopping. There are many other bells and whistles as well. Parameters ---------- model : ``Model``, required. An AllenNLP model to be optimized. Pytorch Modules can also be optimized if their ``forward`` method returns a dictionary with a "loss" key, containing a scalar tensor representing the loss function to be optimized. If you are training your model using GPUs, your model should already be on the correct device. (If you use `Trainer.from_params` this will be handled for you.) optimizer : ``torch.nn.Optimizer``, required. An instance of a Pytorch Optimizer, instantiated with the parameters of the model to be optimized. iterator : ``DataIterator``, required. A method for iterating over a ``Dataset``, yielding padded indexed batches. train_dataset : ``Dataset``, required. A ``Dataset`` to train on. The dataset should have already been indexed. validation_dataset : ``Dataset``, optional, (default = None). A ``Dataset`` to evaluate on. The dataset should have already been indexed. patience : Optional[int] > 0, optional (default=None) Number of epochs to be patient before early stopping: the training is stopped after ``patience`` epochs with no improvement. If given, it must be ``> 0``. If None, early stopping is disabled. validation_metric : str, optional (default="loss") Validation metric to measure for whether to stop training using patience and whether to serialize an ``is_best`` model each epoch. The metric name must be prepended with either "+" or "-", which specifies whether the metric is an increasing or decreasing function. validation_iterator : ``DataIterator``, optional (default=None) An iterator to use for the validation set. If ``None``, then use the training `iterator`. shuffle: ``bool``, optional (default=True) Whether to shuffle the instances in the iterator or not. num_epochs : int, optional (default = 20) Number of training epochs. serialization_dir : str, optional (default=None) Path to directory for saving and loading model files. Models will not be saved if this parameter is not passed. num_serialized_models_to_keep : ``int``, optional (default=20) Number of previous model checkpoints to retain. 
Default is to keep 20 checkpoints. A value of None or -1 means all checkpoints will be kept. keep_serialized_model_every_num_seconds : ``int``, optional (default=None) If num_serialized_models_to_keep is not None, then occasionally it's useful to save models at a given interval in addition to the last num_serialized_models_to_keep. To do so, specify keep_serialized_model_every_num_seconds as the number of seconds between permanently saved checkpoints. Note that this option is only used if num_serialized_models_to_keep is not None, otherwise all checkpoints are kept. checkpointer : ``Checkpointer``, optional (default=None) An instance of class Checkpointer to use instead of the default. If a checkpointer is specified, the arguments num_serialized_models_to_keep and keep_serialized_model_every_num_seconds should not be specified. The caller is responsible for initializing the checkpointer so that it is consistent with serialization_dir. model_save_interval : ``float``, optional (default=None) If provided, then serialize models every ``model_save_interval`` seconds within single epochs. In all cases, models are also saved at the end of every epoch if ``serialization_dir`` is provided. cuda_device : ``int``, optional (default = -1) An integer specifying the CUDA device(s) to use for this process. If -1, the CPU is used. Data parallelism is controlled at the allennlp train level, so each trainer will have a single GPU. grad_norm : ``float``, optional, (default = None). If provided, gradient norms will be rescaled to have a maximum of this value. grad_clipping : ``float``, optional (default = ``None``). If provided, gradients will be clipped `during the backward pass` to have an (absolute) maximum of this value. If you are getting ``NaNs`` in your gradients during training that are not solved by using ``grad_norm``, you may need this. learning_rate_scheduler : ``LearningRateScheduler``, optional (default = None) If specified, the learning rate will be decayed with respect to this schedule at the end of each epoch (or batch, if the scheduler implements the ``step_batch`` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`, this will use the ``validation_metric`` provided to determine if learning has plateaued. To support updating the learning rate on every batch, this can optionally implement ``step_batch(batch_num_total)`` which updates the learning rate given the batch number. momentum_scheduler : ``MomentumScheduler``, optional (default = None) If specified, the momentum will be updated at the end of each batch or epoch according to the schedule. summary_interval: ``int``, optional, (default = 100) Number of batches between logging scalars to tensorboard histogram_interval : ``int``, optional, (default = ``None``) If not None, then log histograms to tensorboard every ``histogram_interval`` batches. When this parameter is specified, the following additional logging is enabled: * Histograms of model parameters * The ratio of parameter update norm to parameter norm * Histogram of layer activations We log histograms of the parameters returned by ``model.get_parameters_for_histogram_tensorboard_logging``. The layer activations are logged for any modules in the ``Model`` that have the attribute ``should_log_activations`` set to ``True``. Logging histograms requires a number of GPU-CPU copies during training and is typically slow, so we recommend logging histograms relatively infrequently. 
Note: only Modules that return tensors, tuples of tensors or dicts with tensors as values currently support activation logging. should_log_parameter_statistics : ``bool``, optional, (default = True) Whether to send parameter statistics (mean and standard deviation of parameters and gradients) to tensorboard. should_log_learning_rate : ``bool``, optional, (default = False) Whether to send parameter specific learning rate to tensorboard. log_batch_size_period : ``int``, optional, (default = ``None``) If defined, how often to log the average batch size. moving_average: ``MovingAverage``, optional, (default = None) If provided, we will maintain moving averages for all parameters. During training, we employ a shadow variable for each parameter, which maintains the moving average. During evaluation, we backup the original parameters and assign the moving averages to corresponding parameters. Be careful that when saving the checkpoint, we will save the moving averages of parameters. This is necessary because we want the saved model to perform as well as the validated model if we load it later. But this may cause problems if you restart the training from checkpoint. distributed: ``bool``, optional, (default = False) If set, PyTorch's `DistributedDataParallel` is used to train the model in multiple GPUs. This also requires `world_size` to be greater than 1. rank: ``int``, optional, (default = 0) This is the unique identifier of the `Trainer` in a distributed process group. The GPU device id is used as the rank. world_size: ``int``, (default = 1) The number of `Trainer` workers participating in the distributed training. num_gradient_accumulation_steps: ``int``, optional, (default = 1) Gradients are accumulated for the given number of steps before doing an optimizer step. This can be useful to accommodate batches that are larger than the RAM size. Refer Thomas Wolf's [post](https://tinyurl.com/y5mv44fw) for details on Gradient Accumulation. """ super().__init__(serialization_dir, cuda_device, distributed, rank, world_size) # I am not calling move_to_gpu here, because if the model is # not already on the GPU then the optimizer is going to be wrong. self.model = model self.iterator = iterator self._validation_iterator = validation_iterator self.shuffle = shuffle self.optimizer = optimizer self.train_data = train_dataset self._validation_data = validation_dataset if patience is None: # no early stopping if validation_dataset: logger.warning( "You provided a validation dataset but patience was set to None, " "meaning that early stopping is disabled") elif (not isinstance(patience, int)) or patience <= 0: raise ConfigurationError( '{} is an invalid value for "patience": it must be a positive integer ' "or None (if you want to disable early stopping)".format( patience)) # For tracking is_best_so_far and should_stop_early self._metric_tracker = MetricTracker(patience, validation_metric) # Get rid of + or - self._validation_metric = validation_metric[1:] self._num_epochs = num_epochs if checkpointer is not None: # We can't easily check if these parameters were passed in, so check against their default values. # We don't check against serialization_dir since it is also used by the parent class. if (num_serialized_models_to_keep != 20 or keep_serialized_model_every_num_seconds is not None): raise ConfigurationError( "When passing a custom Checkpointer, you may not also pass in separate checkpointer " "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'." 
) self._checkpointer = checkpointer else: self._checkpointer = Checkpointer( serialization_dir, keep_serialized_model_every_num_seconds, num_serialized_models_to_keep, ) self._model_save_interval = model_save_interval self._grad_norm = grad_norm self._grad_clipping = grad_clipping self._learning_rate_scheduler = learning_rate_scheduler self._momentum_scheduler = momentum_scheduler self._moving_average = moving_average # We keep the total batch number as an instance variable because it # is used inside a closure for the hook which logs activations in # ``_enable_activation_logging``. self._batch_num_total = 0 self._tensorboard = TensorboardWriter( get_batch_num_total=lambda: self._batch_num_total, serialization_dir=serialization_dir, summary_interval=summary_interval, histogram_interval=histogram_interval, should_log_parameter_statistics=should_log_parameter_statistics, should_log_learning_rate=should_log_learning_rate, ) self._log_batch_size_period = log_batch_size_period self._last_log = 0.0 # time of last logging self._num_gradient_accumulation_steps = num_gradient_accumulation_steps # Enable activation logging. if histogram_interval is not None: self._tensorboard.enable_activation_logging(self.model) # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model` # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc. # # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the # normal case, reference to `Model` is retained. This reference is only used in # these places: `model.__call__`, `model.train` and `model.eval`. if self._distributed: self._pytorch_model = DistributedDataParallel( self.model, device_ids=[self.cuda_device]) else: self._pytorch_model = self.model
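# The docstring above describes num_gradient_accumulation_steps. A minimal, framework-free
# sketch of that semantics (not AllenNLP's actual _train_epoch code): scale each sub-batch
# loss by the group size and step the optimizer once per group, so gradients from all
# sub-batches are summed before the update.
def accumulate_gradients_sketch(model, optimizer, batch_group):
    optimizer.zero_grad()
    for batch in batch_group:  # len(batch_group) == num_gradient_accumulation_steps
        loss = model(**batch)["loss"] / len(batch_group)
        loss.backward()        # gradients add up across sub-batches
    optimizer.step()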
def train_func(config): is_distributed = config.get("is_distributed", False) use_gpu = config["use_gpu"] num_epochs = config["num_epochs"] batch_size = config["batch_size"] num_layers = config["num_layers"] num_hidden = config["num_hidden"] dropout_every = config["dropout_every"] dropout_prob = config["dropout_prob"] num_features = config["num_features"] print("Defining model, loss, and optimizer...") # Setup device. if is_distributed: device = torch.device(f"cuda:{train.local_rank()}" if use_gpu and torch.cuda.is_available() else "cpu") else: device = torch.device( "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu") print(f"Device: {device}") # Setup data. if is_distributed: train_dataset_pipeline = train.get_dataset_shard("train_dataset") train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() test_dataset = train.get_dataset_shard("test_dataset") else: train_dataset_epoch_iterator = config["train_dataset"].iter_epochs() test_dataset = config["test_dataset"] test_torch_dataset = test_dataset.to_torch(label_column="label", batch_size=batch_size) # Setup Tensorboard and MLflow. if is_distributed: # Setup is done through Callback. pass else: writer = SummaryWriter() mlflow.start_run() mlflow_config = config.copy() mlflow_config.pop("test_dataset") mlflow_config.pop("train_dataset") mlflow.log_params(mlflow_config) net = Net( n_layers=num_layers, n_features=num_features, num_hidden=num_hidden, dropout_every=dropout_every, drop_prob=dropout_prob, ).to(device) print(net.parameters) if is_distributed: net = DistributedDataParallel(net) criterion = nn.BCEWithLogitsLoss() optimizer = optim.Adam(net.parameters(), weight_decay=0.0001) print("Starting training...") for epoch in range(num_epochs): train_dataset = next(train_dataset_epoch_iterator) train_torch_dataset = train_dataset.to_torch(label_column="label", batch_size=batch_size) train_running_loss, train_num_correct, train_num_total = train_epoch( train_torch_dataset, net, device, criterion, optimizer) train_acc = train_num_correct / train_num_total print( f"epoch [{epoch + 1}]: training accuracy: {train_num_correct} / {train_num_total} = {train_acc:.4f}" ) test_running_loss, test_num_correct, test_num_total = test_epoch( test_torch_dataset, net, device, criterion) test_acc = test_num_correct / test_num_total print( f"epoch [{epoch + 1}]: testing accuracy: {test_num_correct} / {test_num_total} = {test_acc:.4f}" ) # Record and log stats. if is_distributed: train.report(train_acc=train_acc, train_loss=train_running_loss, test_acc=test_acc, test_loss=test_running_loss) else: writer.add_scalar("Accuracy/train", train_acc, epoch) writer.add_scalar("Loss/train", train_running_loss, epoch) writer.add_scalar("Accuracy/test", test_acc, epoch) writer.add_scalar("Loss/test", test_running_loss, epoch) writer.flush() mlflow.log_metrics({ "train_acc": train_acc, "train_loss": train_running_loss, "test_acc": test_acc, "test_loss": test_running_loss }) # Checkpoint model. if is_distributed: import copy model_copy = copy.deepcopy(net.module) train.save_checkpoint( model_state_dict=model_copy.cpu().state_dict()) else: torch.save(net.state_dict(), f"models/model-epoch-{epoch}.torch") # Shutdown Tensorboard and MLflow. if is_distributed: pass else: writer.close() # mlflow.end_run() if is_distributed: if train.world_rank() == 0: return net.module.cpu() else: return None else: return net
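# --- Illustration: checkpointing with and without DistributedDataParallel ---
# Sketch of the unwrapping convention used above (`net.module` in the distributed
# branch): always persist the bare module's state_dict so a checkpoint written by a
# DDP run can be reloaded on a single process. `net` and `path` are assumed names.
import torch
import torch.nn as nn

def save_unwrapped(net: nn.Module, path: str) -> None:
    bare = getattr(net, "module", net)  # DDP keeps the original model under .module
    torch.save(bare.state_dict(), path)

def load_unwrapped(net: nn.Module, path: str) -> None:
    bare = getattr(net, "module", net)
    bare.load_state_dict(torch.load(path, map_location="cpu"))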
def run(proc_id, n_gpus, args, devices, data): dropout = 0.2 dev_id = devices[proc_id] if n_gpus > 1: dist_init_method = 'tcp://{master_ip}:{master_port}'.format( master_ip='127.0.0.1', master_port='12345') world_size = n_gpus th.distributed.init_process_group(backend="nccl", init_method=dist_init_method, world_size=world_size, rank=proc_id) th.cuda.set_device(dev_id) # Unpack data train_mask, val_mask, in_feats, labels, n_classes, g = data train_nid = train_mask.nonzero().squeeze() val_nid = val_mask.nonzero().squeeze() # Split train_nid train_nid = th.split(train_nid, math.ceil(len(train_nid) / n_gpus))[proc_id] # Create sampler sampler = NeighborSampler(g, [int(_) for _ in args.fan_out.split(',')]) # Create PyTorch DataLoader for constructing blocks dataloader = DataLoader(dataset=train_nid.numpy(), batch_size=args.batch_size, collate_fn=sampler.sample_blocks, shuffle=True, drop_last=False, num_workers=args.num_workers_per_gpu) # Define model model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu) # Move the model to GPU and define optimizer model = model.to(dev_id) if n_gpus > 1: model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id) loss_fcn = nn.CrossEntropyLoss() loss_fcn = loss_fcn.to(dev_id) optimizer = optim.Adam(model.parameters(), lr=args.lr) # Compute history tensor and their aggregation before training on CPU model.eval() if n_gpus > 1: if proc_id == 0: init_history(g, model.module, dev_id, args.val_batch_size) th.distributed.barrier() else: init_history(g, model, dev_id, args.val_batch_size) model.train() # Training loop avg = 0 iter_tput = [] for epoch in range(args.num_epochs): tic = time.time() model.train() for step, (blocks, hist_blocks) in enumerate(dataloader): if proc_id == 0: tic_step = time.time() # The nodes for input lies at the LHS side of the first block. # The nodes for output lies at the RHS side of the last block. seeds = blocks[-1].dstdata[dgl.NID] blocks, hist_blocks = load_subtensor(g, labels, blocks, hist_blocks, dev_id, True) # forward batch_pred = model(blocks) # update history update_history(g, blocks) # compute loss batch_labels = blocks[-1].dstdata['label'] loss = loss_fcn(batch_pred, batch_labels) # backward optimizer.zero_grad() loss.backward() optimizer.step() if proc_id == 0: iter_tput.append( len(seeds) * n_gpus / (time.time() - tic_step)) if step % args.log_every == 0 and proc_id == 0: acc = compute_acc(batch_pred, batch_labels) print( 'Epoch {:05d} | Step {:05d} | Loss {:.4f} | Train Acc {:.4f} | Speed (samples/sec) {:.4f}' .format(epoch, step, loss.item(), acc.item(), np.mean(iter_tput[3:]))) if n_gpus > 1: th.distributed.barrier() toc = time.time() if proc_id == 0: print('Epoch Time(s): {:.4f}'.format(toc - tic)) if epoch >= 5: avg += toc - tic if epoch % args.eval_every == 0 and epoch != 0: model.eval() eval_acc = evaluate(model if n_gpus == 1 else model.module, g, labels, val_nid, args.val_batch_size, dev_id) print('Eval Acc {:.4f}'.format(eval_acc)) if n_gpus > 1: th.distributed.barrier() if proc_id == 0: print('Avg epoch time: {}'.format(avg / (epoch - 4)))
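# --- Illustration: per-process seed-node split ---
# A sketch of the manual data partitioning used above: after the process group is
# initialised, every rank keeps one contiguous chunk of the training node IDs.
# `world_size`, `rank`, and the TCP address are assumed single-machine values.
import math
import torch as th
import torch.distributed as dist

def init_and_split(rank: int, world_size: int, train_nid: th.Tensor) -> th.Tensor:
    dist.init_process_group(
        backend="gloo",  # the snippet above uses "nccl" because every rank owns a GPU
        init_method="tcp://127.0.0.1:12345",
        world_size=world_size,
        rank=rank,
    )
    chunk = math.ceil(len(train_nid) / world_size)
    return th.split(train_nid, chunk)[rank]  # this rank's share of the seed nodes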
ARGS_RESET_EVERY = 100 print("Loading {} for {}".format(params['model'].get('type', 'WTF?'), args.mode), flush=True) model = Model.from_params(params=params['model']) for submodule in model.trunk.detector.backbone.modules(): if isinstance(submodule, BatchNorm2d): submodule.track_running_stats = False for p in submodule.parameters(): p.requires_grad = False if distributed: model.cuda() model = DistributedDataParallel(model) elif NUM_GPUS > 1: model = DataParallel(model).cuda() else: model.cuda() optimizer = Optimizer.from_params( [x for x in model.named_parameters() if x[1].requires_grad], params['trainer']['optimizer']) lr_scheduler_params = params['trainer'].pop("learning_rate_scheduler", None) scheduler = LearningRateScheduler.from_params( optimizer, lr_scheduler_params) if lr_scheduler_params else None if os.path.exists(args.folder): print("Found folder! restoring", flush=True)
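# --- Illustration: freezing BatchNorm layers ---
# Minimal sketch of the BatchNorm handling in the snippet above: stop updating the
# stored running statistics during training and exclude the affine parameters from
# optimisation, so only trainable parameters reach the Optimizer. `backbone` is an
# assumed module name.
import torch.nn as nn

def freeze_batchnorm(backbone: nn.Module) -> None:
    for m in backbone.modules():
        if isinstance(m, nn.BatchNorm2d):
            m.track_running_stats = False  # running mean/var no longer updated
            for p in m.parameters():       # gamma and beta no longer receive gradients
                p.requires_grad = False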
def train(): parser = ArgumentParser() parser.add_argument('--gpt2', action='store_true', help="use gpt2") parser.add_argument("--model_checkpoint", type=str, default="config/cgpt/", help="Path or URL of the model") parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step") parser.add_argument('--pretrained', action='store_true', help="If False train from scratch") parser.add_argument("--data_path", type=str, default="", help="Path or url of the dataset. ") parser.add_argument("--train_path", type=str, default="data/toy_train.txt", help="Path of the train dataset for dist dataset. ") parser.add_argument("--valid_path", type=str, default="data/toy_valid.txt", help="Path of the valid dataset for dist dataset. ") parser.add_argument("--dataset_cache", type=str, default="dataset_cache", help="Path or url of the dataset cache") parser.add_argument('--log_file', '-log_file', type=str, default="", help="Output logs to a file under this path") parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading") parser.add_argument("--n_epochs", type=int, default=70, help="Number of training epochs") parser.add_argument("--train_batch_size", type=int, default=2, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=2, help="Batch size for validation") parser.add_argument("--max_history", type=int, default=15, help="Number of previous exchanges to keep in history") parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'], help="method of optim") parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)") parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps") parser.add_argument("--valid_steps", type=int, default=5000, help="Perfom validation every X steps") parser.add_argument("--gradient_accumulation_steps", type=int, default=64, help="Accumulate gradients on several steps") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
# logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", args.local_rank) logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config tokenizer_class = BertTokenizer if args.pretrained: tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint, do_lower_case=True) model = model_class.from_pretrained(args.model_checkpoint) else: tokenizer = tokenizer_class(os.path.join(args.model_checkpoint, "vocab.txt"), do_lower_case=True) config = config_class.from_json_file( os.path.join(args.model_checkpoint, CONFIG_NAME)) model = model_class(config) model.to(args.device) optimizer = AdamW([{ 'params': model.parameters(), 'initial_lr': args.lr }], lr=args.lr, correct_bias=True) logger.info("Prepare datasets") loader_class = build_dist_loaders if not args.data_path else build_dataloaders train_loader, val_loader, train_sampler, valid_sampler = loader_class( args, tokenizer, logger) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) # Training function and trainer def update(engine, batch): input_ids, token_type_ids, lm_labels = tuple( input_tensor.to(args.device) for input_tensor in batch) model.train() (lm_loss), *_ = model(input_ids, labels=lm_labels, token_type_ids=token_type_ids) loss = lm_loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item(), optimizer.param_groups[0]['lr'] trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): input_ids, token_type_ids, lm_labels = tuple( input_tensor.to(args.device) for input_tensor in batch) # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return lm_logits_flat_shifted, lm_labels_flat_shifted evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: 
evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Evaluation during training @trainer.on(Events.ITERATION_STARTED) def log_iterations(engine): # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0: if engine.state.iteration % args.valid_steps == 0: evaluator.run(val_loader) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # noam decrease the learning rate # model_size = model.config.n_embd model_size = args.n_emd noam_lambda = lambda step: (model_size**(-0.5) * min( (step + 1)**(-0.5), (step + 1) * args.warmup_steps**(-1.5))) noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step) scheduler = LRScheduler(noam_scheduler) if args.scheduler == "linear": scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss") RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0], x[1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints # And save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True, mininterval=2) pbar.attach(trainer, metric_names=["loss", "lr"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=None) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=3) # save model after evaluation evaluator.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file( os.path.join(tb_logger.writer.logdir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.logdir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint # (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and 
args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
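# --- Illustration: the Noam learning-rate schedule ---
# Standalone sketch of the `noam_lambda` used with LambdaLR above: the base lr is
# multiplied by d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)). The toy model
# and the `d_model`/`warmup` values are assumptions for the sake of a runnable example.
import torch
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
d_model, warmup = 768, 5000
noam = lambda step: d_model ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * warmup ** (-1.5))
scheduler = LambdaLR(optimizer, lr_lambda=noam)
for _ in range(3):
    optimizer.step()   # one parameter update ...
    scheduler.step()   # ... then advance the schedule by one iteration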
def main(args): print('_' * 60 + f'\nmain <- {args}') if 'setup(args)': cfg = get_cfg() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() default_setup( cfg, args ) # if you don't like any of the default setup, write your own setup code global CONFIG CONFIG = cfg if True: # N_GPU > 0: # __________________ For Debug _____________________________ # mem_stats_df.record('Before-Build-Model') if 'build_model(cfg)': meta_arch = cfg.MODEL.META_ARCHITECTURE model = META_ARCH_REGISTRY.get(meta_arch)(cfg) # for param in model.backbone.parameters(): # param.requires_grad = False model.to(torch.device(cfg.MODEL.DEVICE)) # __________________ For Debug _____________________________ # mem_stats_df.record('After-Build-Model') if args.eval_only: DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume ) return do_test(cfg, model) distributed = comm.get_world_size() > 1 if distributed: model = DistributedDataParallel( model, device_ids=[comm.get_local_rank()], broadcast_buffers=False ) if 'do-train': dataloader = build_train_dataloader(cfg) if N_GPUS > 0: cfg, model, resume = cfg, model, args.resume model.train() optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) checkpointer = DetectionCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler, ) # "iteration" always be loaded whether resume or not. # "model" state_dict will always be loaded whether resume or not. start_iter = ( checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 ) max_iter = cfg.SOLVER.MAX_ITER # optimizer and scheduler will be resume to checkpointer.checkpointables[*] if resume is True if resume: optimizer = checkpointer.checkpointables['optimizer'] scheduler = checkpointer.checkpointables['scheduler'] else: start_iter = 0 periodic_checkpointer = PeriodicCheckpointer( checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter ) writers = ( [ CommonMetricPrinter(max_iter), JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(cfg.OUTPUT_DIR), ] if comm.is_main_process() else [] ) logger.info("Starting training from iteration {}".format(start_iter)) with EventStorage(start_iter) as storage: for data, itr in zip(dataloader, range(start_iter, max_iter)): iteration = itr + 1 storage.step() loss_dict = model(data) losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} losses_reduced = sum(loss for loss in loss_dict_reduced.values()) if comm.is_main_process(): storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() # __________________ Checkpoint / Test / Metrics ___________ periodic_checkpointer.step(iteration) if ( cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter ): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() if iteration - start_iter > 5 and (iteration % 100 == 0 or iteration == max_iter): for writer in writers: writer.write() # __________________ For Debug _____________________________ # mem_summary = torch.cuda.memory_summary() # tcp_sock.send(mem_summary.encode('utf-8')) global TIC if TIC is None: TIC = datetime.datetime.now() else: toc = 
datetime.datetime.now() logger.info('_' * 35 + f'Time Elapsed: {(toc - TIC).total_seconds()} s') TIC = toc
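# --- Illustration: averaging a loss dict across ranks ---
# Hedged sketch, in plain torch.distributed, of the kind of reduction detectron2's
# `comm.reduce_dict` performs above so that logged losses reflect all workers rather
# than a single rank. Assumes the process group is already initialised and every rank
# reports the same keys.
import torch
import torch.distributed as dist

def average_loss_dict(loss_dict: dict) -> dict:
    if not dist.is_available() or not dist.is_initialized() or dist.get_world_size() == 1:
        return {k: v.item() for k, v in loss_dict.items()}
    keys = sorted(loss_dict.keys())   # identical ordering on every rank
    stacked = torch.stack([loss_dict[k].detach() for k in keys])
    dist.all_reduce(stacked)          # element-wise sum over all ranks
    stacked /= dist.get_world_size()  # convert the sum to a mean
    return {k: v.item() for k, v in zip(keys, stacked)}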