Example No. 1
class TransformerHandler(nn.Module):  # subclass nn.Module so the wrapped Transformer's parameters are registered
    def __init__(self,
                 encoder_vocab: list,
                 decoder_vocab: list,
                 decoder_sos_idx: int,
                 decoder_eos_idx: int = None):
        super(TransformerHandler, self).__init__()
        self.input = encoder_vocab
        self.output = decoder_vocab
        self.encoder_dim = len(encoder_vocab)
        self.decoder_dim = len(decoder_vocab)
        self.decoder_sos_idx = decoder_sos_idx
        self.decoder_pad_idx = decoder_vocab.index(PAD)
        self.encoder_pad_idx = encoder_vocab.index(PAD)

        if decoder_eos_idx is None:
            self.decoder_eos_idx = decoder_vocab.index(EOS)
        else:
            self.decoder_eos_idx = decoder_eos_idx
        self.transformer = Transformer(self.encoder_dim, self.decoder_dim,
                                       self.encoder_pad_idx,
                                       self.decoder_pad_idx)

    def forward(self, src: torch.Tensor, trg: torch.Tensor):
        src_mask = self.get_pad_mask(src, self.encoder_pad_idx)
        trg_mask = self.get_pad_mask(trg, self.decoder_pad_idx)
        output = self.transformer.forward(src.unsqueeze(1), trg.unsqueeze(1),
                                          src_mask, trg_mask)

        return output

    def get_pad_mask(self, seq, pad_idx):
        return (seq != pad_idx)
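
get_pad_mask simply marks non-padding positions. A standalone illustration of the boolean mask it produces, assuming (as elsewhere in these examples) that PAD maps to index 0:

import torch

pad_idx = 0
seq = torch.tensor([[5, 7, 9, 0, 0],
                    [3, 4, 0, 0, 0]])
mask = (seq != pad_idx)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True, False, False, False]])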
Example No. 2
    def __init__(self,
                 encoder_vocab: list,
                 decoder_vocab: list,
                 decoder_sos_idx: int,
                 decoder_eos_idx: int = None):
        super(TransformerHandler, self).__init__()
        self.input = encoder_vocab
        self.output = decoder_vocab
        self.encoder_dim = len(encoder_vocab)
        self.decoder_dim = len(decoder_vocab)
        self.decoder_sos_idx = decoder_sos_idx
        self.decoder_pad_idx = decoder_vocab.index(PAD)
        self.encoder_pad_idx = encoder_vocab.index(PAD)

        if decoder_eos_idx is None:
            self.decoder_eos_idx = decoder_vocab.index(EOS)
        else:
            self.decoder_eos_idx = decoder_eos_idx
        self.transformer = Transformer(self.encoder_dim, self.decoder_dim,
                                       self.encoder_pad_idx,
                                       self.decoder_pad_idx)
Example No. 3
def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4:d}_{5:d}_{6:d}_{7:d}{8}".format(
                                    args.model, args.dataset,
                                    args.batch_size, args.dim_model,
                                    args.dim_ff, args.dim_KV, args.num_layers,
                                    args.num_heads, args.flag)

    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT+'_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT+'.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT+'_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                                BEST_CHECKPOINT_FILE_NAME)

    # Set the random seed for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Setting constants
    dim_model = args.dim_model      # Dimension of model (=Embedding size)
    dim_ff = args.dim_ff            # Dimension of FeedForward
    dim_K = args.dim_KV             # Dimension of Key(=Query)
    dim_V = args.dim_KV             # Dimension of Value
    num_layers = args.num_layers    # Number of encoder/decoder layers
    num_heads = args.num_heads      # Number of heads in Multi-Head Attention
    dropout_p = args.dropout_p      # Dropout probability
    warmup_steps = 4000             # Learning rate warm-up steps
    label_smoothing_eps = 0.1       # Label smoothing epsilon
    max_src_len = 46                # Maximum source input length (Multi30k)
    max_trg_len = 45                # Maximum target input length (Multi30k)

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'WMT2014':
        dataloader = WMT2014_Dataloader()
    elif args.dataset == 'Multi30k':
        dataloader = Multi30k_Dataloader()
    else:
        assert False, "Please select the proper dataset."

    train_loader = dataloader.get_train_loader(batch_size=args.batch_size)
    val_loader = dataloader.get_val_loader(batch_size=args.batch_size)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make Translation model
    if args.model == 'Transformer':
        src_vocab_size = len(dataloader.SRC.vocab)
        trg_vocab_size = len(dataloader.TRG.vocab)
        model = Transformer(src_vocab_size, trg_vocab_size,
                            max_src_len, max_trg_len, dim_model, dim_K,
                            num_layers, num_heads, dim_ff, dropout_p)
    else:
        assert False, "Please select the proper model."

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # Check CUDA available
    if torch.cuda.is_available():
        model.cuda()

    print('==> Model ready.')

    # Step3 ====================================================================
    # Set loss function and optimizer (+ lrate scheduler)
    if args.smoothing:
        criterion = Criterion_LabelSmoothing(vocab_size=trg_vocab_size,
                                            padding_idx=dataloader.pad_idx,
                                            smoothing_eps=label_smoothing_eps)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=dataloader.pad_idx)
    optimizer = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)
    lr_scheduler = Warmup_scheduler(optimizer, dim_model, warmup_steps)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_val_metric = 0

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        lr_scheduler.current_step = checkpoint['current_step']
        best_val_metric = checkpoint['best_val_metric']

    # Save the training information
    result_data = {}
    result_data['model']            = args.model
    result_data['dataset']          = args.dataset
    result_data['target epoch']     = args.epochs
    result_data['batch_size']       = args.batch_size

    # Initialize the result lists
    train_loss = []
    train_ppl = []
    train_bleu = []

    val_loss = []
    val_ppl = []
    val_bleu = []

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    for epoch in range(args.epochs):
        # Start after the checkpoint epoch (skip epochs already completed)
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch+1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # train the model
        tloss, tmetric = train(model, train_loader, criterion,
                                    optimizer, lr_scheduler, dataloader)
        train_loss.append(tloss)
        train_ppl.append(tmetric[0])
        train_bleu.append(tmetric[1])

        # validate the model
        vloss, vmetric = val(model, val_loader, criterion, dataloader)
        val_loss.append(vloss)
        val_ppl.append(vmetric[0])
        val_bleu.append(vmetric[1])

        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current epoch']    = epoch
        result_data['train_loss']       = train_loss
        result_data['train_ppl']        = train_ppl
        result_data['train_bleu']       = train_bleu
        result_data['val_loss']         = val_loss
        result_data['val_ppl']          = val_ppl
        result_data['val_bleu']         = val_bleu

        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

        # Save the best checkpoint
        if vmetric[1] > best_val_metric:
            best_val_metric = vmetric[1]
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'current_step': lr_scheduler.current_step,
                'best_val_metric': best_val_metric,
                }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'current_step': lr_scheduler.current_step,
            'val_metric': vmetric[0],
            'best_val_metric': best_val_metric,  # needed when resuming from this checkpoint
            }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("current step         : {:d}".format(lr_scheduler.current_step))
        print("current lrate        : {:f}".format(optimizer.param_groups[0]['lr']))
        print("train/val loss       : {:f}/{:f}".format(tloss,vloss))
        print("train/val PPL        : {:f}/{:f}".format(tmetric[0],vmetric[0]))
        print("train/val BLEU       : {:f}/{:f}".format(tmetric[1],vmetric[1]))
        print("epoch time           : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))
    print('==> Train done.')

    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
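
The Warmup_scheduler built above from dim_model and warmup_steps=4000 is not shown in this example; its current_step attribute is what gets checkpointed and restored. A minimal sketch, assuming it implements the inverse-square-root warm-up rule from "Attention Is All You Need" (lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)) and is stepped once per optimizer update:

class Warmup_scheduler:
    """Sketch only, not the repository's implementation."""

    def __init__(self, optimizer, dim_model, warmup_steps):
        self.optimizer = optimizer
        self.dim_model = dim_model
        self.warmup_steps = warmup_steps
        self.current_step = 0

    def step(self):
        self.current_step += 1
        lr = (self.dim_model ** -0.5) * min(self.current_step ** -0.5,
                                            self.current_step * self.warmup_steps ** -1.5)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr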
Example No. 4
                     batch_size=config.batch_size,
                     shuffle=True,
                     collate_fn=generate_batch)
# Debug peek at one raw sample (leave commented out so the training code below can run)
# print(next(iter(dataset)))
# sys.exit(0)
train_data, test_data = train_test_split(dataset,
                                         test_size=0.25,
                                         random_state=42)

train_iter = DataLoader(train_data,
                        batch_size=config.batch_size,
                        shuffle=True,
                        collate_fn=generate_batch)
test_iter = DataLoader(test_data,
                       batch_size=config.batch_size,
                       shuffle=True,
                       collate_fn=generate_batch)

if __name__ == '__main__':
    # Initialize the model parameters
    model = Transformer()
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    pl.seed_everything(42)
    trainer = pl.Trainer(auto_scale_batch_size='power', deterministic=True)
    trainer.fit(model, DataLoader(dataset))
'''
python --src_lang en --trg_lang fr --src_file_path data/english.txt --trg_file_path data/french.txt
'''
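
The DataLoaders above rely on a generate_batch collate function that the excerpt does not show. A minimal sketch, assuming each dataset item is a (source, target) pair of token-id tensors and that the pad index is 1 (both assumptions, not taken from the snippet):

import torch
from torch.nn.utils.rnn import pad_sequence

PAD_IDX = 1  # assumed pad id

def generate_batch(batch):
    # batch: list of (src_tensor, trg_tensor) pairs of token ids
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX)
    return src_batch, trg_batch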
Example No. 5
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Average only over non-padded positions (pad id is 0)
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)


transformer = Transformer(num_layers,
                          d_model,
                          num_heads,
                          dff,
                          input_vocab_size,
                          target_vocab_size,
                          pe_input=input_vocab_size,
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')
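
loss_function masks padding before averaging; a masked accuracy metric following the same pad-id-0 convention is often tracked next to it. A sketch (not part of this snippet):

def accuracy_function(real, pred):
    accuracies = tf.equal(tf.cast(real, tf.int64), tf.argmax(pred, axis=2))
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)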
Example No. 6
    input_vocab_size = tokenizer_X.vocab_size + 2
    target_vocab_size = tokenizer_y.vocab_size + 2

    print("...done.")
    print("\nDefining the Transformer model...")

    # Using the Adam optimizer
    optimizer = tf.keras.optimizers.Adam(CustomSchedule(D_MODEL),
                                         beta_1=BETA_1,
                                         beta_2=BETA_2,
                                         epsilon=EPSILON)

    train_loss = tf.keras.metrics.Mean(name="train_loss")
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name="train_acc")

    transformer = Transformer(NUM_LAYERS, D_MODEL, NUM_HEADS, DFF,
                              input_vocab_size, target_vocab_size, DROPOUT)

    print("...done.")
    print("\nTraining...\n")

    # Model saving
    ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

    if CONTINUE_FROM_CKPT:
        # Load last checkpoint
        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  CHECKPOINT_PATH,
                                                  max_to_keep=999)
    else:
        if not os.path.isdir("checkpoints"):
            os.mkdir("checkpoints")
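
CustomSchedule(D_MODEL), passed to the Adam optimizer above, is defined elsewhere in the project. A sketch, assuming it is a Keras LearningRateSchedule implementing the same inverse-square-root warm-up rule sketched after Example No. 3:

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.minimum(arg1, arg2)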
Example No. 7
def evaluate_and_visualize_attention(ckpt_list):
    #===========================================================================
    for ckpt_name in ckpt_list:
        #=======================================================================
        # Parsing the hyper-parameters
        parsing_list = ckpt_name.split('.')[0].split('_')

        # Setting constants
        model_type = parsing_list[0]
        dataset_type = parsing_list[1]
        batch_size = 512  # fixed here; int(parsing_list[2]) would recover the training batch size
        dim_model = int(parsing_list[3])
        dim_ff = int(parsing_list[4])
        dim_K = int(parsing_list[5])
        num_layers = int(parsing_list[6])
        num_heads = int(parsing_list[7])
        dropout_p = 0.1  # Dropout probability
        label_smoothing_eps = 0.1  # Label smoothing epsilon
        max_src_len = 46  # Maximum source input length (Multi30k)
        max_trg_len = 45  # Maximum target input length (Multi30k)

        # Step1 ================================================================
        # Load dataset
        if dataset_type == 'WMT2014':
            dataloader = WMT2014_Dataloader()
        elif dataset_type == 'Multi30k':
            dataloader = Multi30k_Dataloader()
        else:
            assert False, "Please select the proper dataset."

        test_loader = dataloader.get_test_loader(batch_size=batch_size)
        print('==> DataLoader ready.')

        # Step2 ================================================================
        # Make Translation model
        if model_type == 'Transformer':
            src_vocab_size = len(dataloader.SRC.vocab)
            trg_vocab_size = len(dataloader.TRG.vocab)
            model = Transformer(src_vocab_size, trg_vocab_size, max_src_len,
                                max_trg_len, dim_model, dim_K, num_layers,
                                num_heads, dim_ff, dropout_p)
        else:
            assert False, "Please select the proper model."

        # Check DataParallel available
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)

        # Check CUDA available
        if torch.cuda.is_available():
            model.cuda()

        # Count the model parameters
        num_params = sum(p.numel() for p in model.parameters()
                         if p.requires_grad)
        print('==> Model ready.')

        # Step3 ====================================================================
        # Set loss function
        criterion = nn.CrossEntropyLoss(ignore_index=dataloader.pad_idx)
        print('==> Criterion ready.')

        # Step4 ====================================================================
        # Test the model
        checkpoint = torch.load(os.path.join(CHECKPOINT_PATH, ckpt_name))
        model.load_state_dict(checkpoint['model_state_dict'])

        # test the model
        loss, metric = test(model, test_loader, criterion, dataloader)

        # Print the result on the console
        print("model                  : {}".format(model_type))
        print("dataset                : {}".format(dataset_type))
        print('# of model parameters  : {:d}'.format(num_params))
        print("batch_size             : {}".format(batch_size))
        print("test loss              : {:f}".format(loss))
        print("test PPL               : {:f}".format(metric[0]))
        print("test BLEU              : {:f}".format(metric[1]))
        print('-' * 50)
    print('==> Evaluation done.')
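
The filename parsing at the top of this function mirrors the FILE_NAME_FORMAT built by the training script in Example No. 3. With a hypothetical checkpoint name (illustration only):

ckpt_name = "Transformer_Multi30k_128_512_2048_64_6_8.ckpt"
parsing_list = ckpt_name.split('.')[0].split('_')
# parsing_list == ['Transformer', 'Multi30k', '128', '512', '2048', '64', '6', '8']
#                   model          dataset    batch  d_model  d_ff  d_KV layers heads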
Example No. 8
def train():
    """ 训练入口 """
    # ************************************
    # 1. 准备数据集
    # ************************************
    train_dataset, val_dataset, input_vocab_size, target_vocab_size = gen_dataset(CONFIG['model']['batch_size'])

    # ************************************
    # 2. The three core ML components: model, loss function, optimizer
    # ************************************
    d_model = CONFIG['model']['d_model']
    learning_rate = CustomSchedule(d_model)
    optimizer = keras.optimizers.Adam(learning_rate, beta_1=0.9,
                                      beta_2=0.98, epsilon=1e-9)
    loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    
    train_loss = keras.metrics.Mean(name='train_loss')
    train_accuracy = keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    
    transformer = Transformer(
        num_layers=CONFIG['model']['num_layers'],
        d_model=CONFIG['model']['d_model'],
        num_heads=CONFIG['model']['num_heads'],
        dff=CONFIG['model']['dff'],
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size
    )

    # ************************************
    # 3. Training setup
    # ************************************

    # Define the checkpoint manager
    checkpoint_path = get_path(CONFIG['checkpoint_path'])
    ckpt = tf.train.Checkpoint(model=transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
    
    # Use TensorBoard to visualize the training loss and accuracy
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = get_path('logs/gradient_tape/' + current_time + '/train')
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # @tf.function
    def train_step(inp, tar):
        tar_input = tar[:, :-1]     # target with the leading [start] token, i.e. the right-shifted decoder input
        tar_real = tar[:, 1:]       # the real target sequence the decoder should predict
    
        enc_padding_mask, combine_mask, dec_padding_mask = create_mask(inp, tar_input)
    
        with tf.GradientTape() as tape:
            predictions, _ = transformer(inp,
                                         tar=tar_input,
                                         training=True,
                                         enc_padding_mask=enc_padding_mask,
                                         look_ahead_mask=combine_mask,
                                         dec_padding_mask=dec_padding_mask)
            loss = loss_function(tar_real, predictions, loss_object)
            
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        train_loss(loss)
        train_accuracy(tar_real, predictions)
        
    # ************************************
    # 4. Start training
    # ************************************
    for epoch in range(CONFIG['model']['epochs']):
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)
            
            # Log loss and accuracy
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', train_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', train_accuracy.result(), step=epoch)
            
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                    epoch+1, batch, train_loss.result(), train_accuracy.result()))
        
        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))
            
        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
        
        # Run a validation pass
        validate(transformer, val_dataset)
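
create_mask(inp, tar_input) is imported from elsewhere in the project; a sketch of the conventional TensorFlow masking helpers it presumably wraps (pad id assumed to be 0), combining a padding mask with a look-ahead mask for the decoder:

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len)

def create_look_ahead_mask(size):
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def create_mask(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combine_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combine_mask, dec_padding_mask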
Example No. 9
def predict(invocations, result_cnt=5):

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")
    bash = Field(tokenize=tokenize_bash,
                 lower=True,
                 init_token="<sos>",
                 eos_token="<eos>")
    fields = {"English": ("eng", english), "Bash": ("bash", bash)}
    train_data, test_data = TabularDataset.splits(
        path="",
        train="src/submission_code/train.json",
        test="src/submission_code/test.json",
        format="json",
        fields=fields)
    english.build_vocab(train_data, max_size=10000, min_freq=2)
    bash.build_vocab(train_data, max_size=10000, min_freq=2)

    # We're ready to define everything we need for training our Seq2Seq model
    device = torch.device("cpu")
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    load_model = True
    save_model = False

    learning_rate = 1e-4

    # Model hyperparameters
    src_vocab_size = len(english.vocab)
    trg_vocab_size = len(bash.vocab)
    embedding_size = 256
    num_heads = 8
    num_encoder_layers = 8
    num_decoder_layers = 8
    dropout = 0.10
    max_len = 100
    forward_expansion = 2048
    src_pad_idx = english.vocab.stoi["<pad>"]

    model = Transformer(
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if load_model:
        load_checkpoint(
            torch.load("src/my_checkpoint.pth.tar", map_location='cpu'), model,
            optimizer)
    """
    Function called by the evaluation script to interface the participants model
    `predict` function accepts the natural language invocations as input, and returns
    the predicted commands along with confidences as output. For each invocation,
    `result_cnt` number of predicted commands are expected to be returned.

    Args:
        1. invocations : `list (str)` : list of `n_batch` (default 16) natural language invocations
        2. result_cnt : `int` : number of predicted commands to return for each invocation

    Returns:
        1. commands : `list [ list (str) ]` : a list of list of strings of shape (n_batch, result_cnt)
        2. confidences: `list[ list (float) ]` : confidences corresponding to the predicted commands
                                                 confidence values should be between 0.0 and 1.0.
                                                 Shape: (n_batch, result_cnt)
    """

    n_batch = len(invocations)

    # `commands` and `confidences` have shape (n_batch, result_cnt)
    commands = [[''] * result_cnt for _ in range(n_batch)]
    cf = [1.0] * (result_cnt - 1)
    cf.append(0)
    confidences = [cf for _ in range(n_batch)]

    ################################################################################################
    #     Participants should add their codes to fill predict `commands` and `confidences` here    #
    ################################################################################################
    for idx, inv in enumerate(invocations):

        # Call the translate method to retrieve translations and scores
        prediction = translate_sentence(model,
                                        inv,
                                        english,
                                        bash,
                                        device,
                                        max_length=30)[:-1]
        temp = " ".join(prediction)
        top_commands = [temp] * result_cnt
        print(top_commands)
        # For testing evalAI docker push, just fill top command - just need to check
        # if tellina imports work correctly right now
        for i in range(result_cnt):
            commands[idx][i] = top_commands[i]

    ################################################################################################
    #                               Participant code block ends                                    #
    ################################################################################################

    return commands, confidences
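
translate_sentence is imported from the submission code and not shown here. A greedy-decoding sketch of what such a helper typically looks like, assuming the model accepts (seq_len, batch)-shaped source and target index tensors and returns per-position vocabulary logits:

def translate_sentence(model, sentence, english, bash, device, max_length=30):
    # Tokenize the invocation the same way the English Field does
    tokens = [tok.lower() for tok in tokenize_eng(sentence)]
    tokens = [english.init_token] + tokens + [english.eos_token]
    src_indexes = [english.vocab.stoi[tok] for tok in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    outputs = [bash.vocab.stoi["<sos>"]]
    model.eval()
    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
        with torch.no_grad():
            logits = model(src_tensor, trg_tensor)
        best_guess = logits.argmax(2)[-1, :].item()
        outputs.append(best_guess)
        if best_guess == bash.vocab.stoi["<eos>"]:
            break

    # Drop the leading <sos>; the caller above strips the trailing <eos> with [:-1]
    return [bash.vocab.itos[idx] for idx in outputs[1:]]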
Example No. 10
# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

train_iterator, test_iterator = BucketIterator.splits((train_data, test_data),
                                                      batch_size=batch_size,
                                                      device=device)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

wandb.watch(model)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=10,
                                                       verbose=True)
            optimizer.step()
            total_loss += loss.item()
            accuracy += (torch.argmax(logit, dim=1)==target).sum().item()
        print('>>> Epoch_{}, Train loss is {}, Accuracy:{} \n'.format(epoch, total_loss/total_train_num, accuracy/total_train_num))
        model.eval()
        total_loss = 0.0
        accuracy = 0
        total_valid_num = len(dev_iter.dataset)
        for batch in dev_iter:
            feature = batch.text  # (W,N) (N)
            target = batch.label
            with torch.no_grad():
                feature = torch.t(feature)
            feature, target = feature.to(device), target.to(device)
            out = model(feature)
            loss = F.cross_entropy(out, target)
            total_loss += loss.item()
            accuracy += (torch.argmax(out, dim=1)==target).sum().item()
        print('>>> Epoch_{}, Valid loss:{}, Accuracy:{} \n'.format(epoch, total_loss/total_valid_num, accuracy/total_valid_num))

def saveModel(model,name):
    torch.save(model, 'done_model/'+name+'_model.pkl')

model = Transformer()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = DataSet.getIter()

if __name__ == '__main__':
    train_model(train_iter, val_iter, model, device)
    saveModel(model,'transformer')
    test_model(test_iter, model, device)
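
test_model is called above but not defined in this excerpt. A sketch mirroring the validation loop earlier in the example (assuming batches expose .text and .label the same way the dev iterator does):

import torch.nn.functional as F

def test_model(test_iter, model, device):
    model.eval()
    total_loss, accuracy = 0.0, 0
    total_test_num = len(test_iter.dataset)
    with torch.no_grad():
        for batch in test_iter:
            feature = torch.t(batch.text).to(device)  # (N, W), matching the model input above
            target = batch.label.to(device)
            out = model(feature)
            total_loss += F.cross_entropy(out, target).item()
            accuracy += (torch.argmax(out, dim=1) == target).sum().item()
    print('>>> Test loss:{}, Accuracy:{} \n'.format(total_loss / total_test_num,
                                                    accuracy / total_test_num))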