import os

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

# Project-local imports (setup, Dictionary, TransformerModel, dataloader,
# Trainer, and the classes used in on_init below) are assumed to be available
# from elsewhere in the project; they are not shown in this snippet.


def train(rank, args):
    print(f"Running basic DDP example on rank {rank} {args.master_port}.")
    # Join the distributed process group and pin this worker to its GPU.
    setup(rank, args.world_size, args.master_port)
    args.local_rank = rank
    torch.manual_seed(args.seed)
    torch.cuda.set_device(rank)

    src_vocab = Dictionary.read_vocab(args.vocab_src)
    tgt_vocab = Dictionary.read_vocab(args.vocab_tgt)
    batch_size = args.batch_size

    # model init: seq2seq Transformer, moved to this rank's GPU and wrapped in DDP
    model = TransformerModel(d_model=args.d_model,
                             nhead=args.nhead,
                             num_encoder_layers=args.num_encoder_layers,
                             num_decoder_layers=args.num_decoder_layers,
                             dropout=args.dropout,
                             attention_dropout=args.attn_dropout,
                             src_dictionary=src_vocab,
                             tgt_dictionary=tgt_vocab)
    model.to(rank)
    model = DDP(model, device_ids=[rank])

    if rank == 0:
        print(model)
        print('num. model params: {} (num. trained: {})'.format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        ))

    # data load: parallel (source/target) corpora for training and validation
    train_loader = dataloader.get_train_parallel_loader(args.train_src, args.train_tgt,
                                                        src_vocab, tgt_vocab,
                                                        batch_size=batch_size)
    valid_loader = dataloader.get_valid_parallel_loader(args.valid_src, args.valid_tgt,
                                                        src_vocab, tgt_vocab,
                                                        batch_size=batch_size)
    data = {'dataloader': {'train': train_loader, 'valid': valid_loader}}

    trainer = Trainer(model, data, args)
    # Note: range() is exclusive at the top, so this runs epochs 1 .. max_epoch - 1.
    for epoch in range(1, args.max_epoch):
        trainer.mt_step(epoch)          # one epoch of MT training
        trainer.evaluate(epoch)         # validation
        trainer.save_checkpoint(epoch)
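# Minimal sketch of the `setup` helper that `train` above relies on, assuming a
# single-node DDP run: the nccl backend and the MASTER_ADDR/MASTER_PORT
# rendezvous are assumptions here, and the project's actual helper may differ.
import torch.distributed as dist


def setup(rank, world_size, master_port):
    # Every worker must agree on the same rendezvous address before joining
    # the process group.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ["MASTER_PORT"] = str(master_port)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)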
# Note: `on_init` is an instance method; its enclosing (trainer) class is not
# shown in this snippet.
def on_init(self, params, p_params):
    # If a previous debias run left a checkpoint, use its dump directory and
    # checkpoint; otherwise fall back to the regular checkpoint path.
    dump_path = os.path.join(params.dump_path, "debias")
    checkpoint_path = os.path.join(dump_path, "checkpoint.pth")
    if os.path.isfile(checkpoint_path):
        self.params.dump_path = dump_path
        self.checkpoint_path = checkpoint_path
        self.from_deb = True
    else:
        self.checkpoint_path = os.path.join(params.dump_path, "checkpoint.pth")
        self.from_deb = False

    # Debiasing module: an encoder-only TransformerModel built on the main
    # model's dictionary, with no output projection and no embedding layer.
    deb = TransformerModel(p_params, self.model.dico,
                           is_encoder=True, with_output=False, with_emb=False)
    # deb = LinearDeb(p_params)  # alternative linear debiasing module (disabled)
    self.deb = deb.to(params.device)
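# Minimal launch sketch for the `train` function above, one process per GPU.
# `parse_args` is a hypothetical stand-in for the project's argument parser;
# it would need to provide world_size, master_port, seed, vocab/data paths and
# the model hyper-parameters used in train().
import torch.multiprocessing as mp

if __name__ == "__main__":
    args = parse_args()  # hypothetical parser; not defined in this snippet
    # Spawn args.world_size workers; each one runs train(rank, args).
    mp.spawn(train, args=(args,), nprocs=args.world_size, join=True)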