import torch
import torch.nn as nn

# Transformer, Trainer, model_utils, and const are project-local modules.

# Seed the CUDA RNG and force deterministic cuDNN kernels for reproducibility.
torch.cuda.manual_seed(const.SEED)
torch.backends.cudnn.deterministic = True

model = Transformer(
    source_vocab_size=SOURCE_VOCAB_SIZE,
    target_vocab_size=TARGET_VOCAB_SIZE,
    source_padding_index=SRC_PAD_IDX,
    target_padding_index=TRG_PAD_IDX,
    embedding_size=const.EMBEDDING_SIZE,
    number_of_layers=const.NUMBER_OF_LAYERS,
    number_of_heads=const.NUMBER_OF_HEADS,
    forward_expansion=const.FORWARD_EXPANSION,
    device=device,
).to(device)
model.apply(model_utils.initialize_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=const.LEARNING_RATE)
# Padding positions in the target must not contribute to the loss.
cross_entropy = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

print(f'The model has {model_utils.count_parameters(model):,} trainable parameters')

trainer = Trainer(
    const=const,
    optimizer=optimizer,
    criterion=cross_entropy,
    device=device,
)
trainer.train(
    model=model,
    train_iterator=train_iterator,
)
from torch.optim import Adam

# initialize_weights and the hyperparameters below (d_model, n_head, ...)
# are assumed to be defined in the surrounding project code.
model = Transformer(
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    trg_sos_idx=trg_sos_idx,
    d_model=d_model,
    enc_voc_size=enc_voc_size,
    dec_voc_size=dec_voc_size,
    max_len=max_len,
    ffn_hidden=ffn_hidden,
    n_head=n_head,
    n_layers=n_layers,
    drop_prob=drop_prob,
    device=device,
).to(device)
model.apply(initialize_weights)

optimizer = Adam(
    params=model.parameters(),
    lr=init_lr,
    weight_decay=weight_decay,
    eps=adam_eps,
)
# Ignore padding in the *target* sequence when computing the loss; the
# original passed src_pad_idx, which is only correct when both
# vocabularies happen to share the same padding index.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)


def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg

        # The source is truncated here; the rest of the loop body is
        # reconstructed along the standard teacher-forcing pattern.
        optimizer.zero_grad()
        # Feed the target shifted right (drop the last token); the model
        # is trained to predict the target shifted left (drop <sos>).
        output = model(src, trg[:, :-1])
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        # Clip gradients to keep updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
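A minimal sketch of driving this train function over several epochs, assuming a train_iter data iterator and n_epochs/clip values from the surrounding config (all three names are hypothetical, not from the source):

best_loss = float('inf')
for epoch in range(n_epochs):
    # One full pass over the training data; returns mean per-batch loss.
    loss = train(model, train_iter, optimizer, criterion, clip)
    print(f'Epoch {epoch + 1:03d} | train loss: {loss:.3f}')
    # Keep the best checkpoint by training loss (a validation loss would
    # normally be preferred; none is shown in this snippet).
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), 'model-best.pt')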