# Example No. 1 (score: 0)
def training(args):
    """Run the full training loop for the sequence-to-sequence Transformer.

    Loads preprocessed token-index data, builds train/valid dataloaders,
    trains with mixed precision (AMP) and gradient clipping, validates every
    epoch, and checkpoints whenever validation accuracy improves.

    Args:
        args: argparse.Namespace carrying data paths (``preprocess_path``,
            ``save_path``), model hyper-parameters, optimizer/scheduler
            selection, batch/epoch settings, and flags such as ``resume``
            and ``parallel``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #==============Logging==============#
    #===================================#

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    handler = TqdmLoggingHandler()
    handler.setFormatter(
        logging.Formatter(" %(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.propagate = False

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Data open
    write_log(logger, "Load data...")
    # gc disabled while unpickling: avoids the collector repeatedly scanning
    # the millions of freshly created objects, which speeds up loading.
    gc.disable()
    with open(os.path.join(args.preprocess_path, 'processed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        train_src_indices = data_['train_src_indices']
        valid_src_indices = data_['valid_src_indices']
        train_trg_indices = data_['train_trg_indices']
        valid_trg_indices = data_['valid_trg_indices']
        src_word2id = data_['src_word2id']
        trg_word2id = data_['trg_word2id']
        src_vocab_num = len(src_word2id)
        trg_vocab_num = len(trg_word2id)
        del data_
    gc.enable()
    write_log(logger, "Finished loading data!")

    # 2) Dataloader setting
    dataset_dict = {
        'train':
        CustomDataset(train_src_indices,
                      train_trg_indices,
                      min_len=args.min_len,
                      src_max_len=args.src_max_len,
                      trg_max_len=args.trg_max_len),
        'valid':
        CustomDataset(valid_src_indices,
                      valid_trg_indices,
                      min_len=args.min_len,
                      src_max_len=args.src_max_len,
                      trg_max_len=args.trg_max_len),
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True,
                   num_workers=args.num_workers),
        'valid':
        DataLoader(dataset_dict['valid'],
                   drop_last=False,
                   batch_size=args.batch_size,
                   shuffle=False,
                   pin_memory=True,
                   num_workers=args.num_workers)
    }
    write_log(
        logger,
        f"Total number of trainingsets  iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Train setting===========#
    #===================================#

    # 1) Model initiating
    write_log(logger, 'Instantiating model...')
    model = Transformer(
        src_vocab_num=src_vocab_num,
        trg_vocab_num=trg_vocab_num,
        pad_idx=args.pad_id,
        bos_idx=args.bos_id,
        eos_idx=args.eos_id,
        d_model=args.d_model,
        d_embedding=args.d_embedding,
        n_head=args.n_head,
        dim_feedforward=args.dim_feedforward,
        num_common_layer=args.num_common_layer,
        num_encoder_layer=args.num_encoder_layer,
        num_decoder_layer=args.num_decoder_layer,
        src_max_len=args.src_max_len,
        trg_max_len=args.trg_max_len,
        dropout=args.dropout,
        embedding_dropout=args.embedding_dropout,
        trg_emb_prj_weight_sharing=args.trg_emb_prj_weight_sharing,
        emb_src_trg_weight_sharing=args.emb_src_trg_weight_sharing,
        parallel=args.parallel)
    model.train()
    model = model.to(device)
    # Causal mask for the decoder; trg_max_len - 1 because the decoder input
    # is the target shifted by one (trg[:, :-1]).
    tgt_mask = model.generate_square_subsequent_mask(args.trg_max_len - 1,
                                                     device)

    # 2) Optimizer & Learning rate scheduler setting
    optimizer = optimizer_select(model, args)
    scheduler = shceduler_select(optimizer, dataloader_dict, args)  # sic: helper name
    scaler = GradScaler()

    # 3) Model resume
    start_epoch = 0
    if args.resume:
        write_log(logger, 'Resume model...')
        checkpoint = torch.load(
            os.path.join(args.save_path, 'checkpoint.pth.tar'))
        # FIX: keep the saved epoch number as-is. The training loop below
        # begins at ``start_epoch + 1``, so the previous ``+ 1`` here made
        # a resumed run silently skip one epoch.
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        scaler.load_state_dict(checkpoint['scaler'])
        del checkpoint

    #===================================#
    #=========Model Train Start=========#
    #===================================#

    # best_val_acc / best_epoch are plain Python numbers so the final report
    # works even if no epoch ever improves on the initial value.
    best_val_acc = 0.0
    best_epoch = 0
    freq = 0  # training batches seen since the last progress log line

    write_log(logger, 'Training start!')

    for epoch in range(start_epoch + 1, args.num_epochs + 1):
        start_time_e = time()
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            if phase == 'valid':
                write_log(logger, 'Validation start...')
                val_loss = 0
                val_acc = 0
                model.eval()
            for i, (src, trg) in enumerate(
                    tqdm(dataloader_dict[phase],
                         bar_format='{l_bar}{bar:30}{r_bar}{bar:-2b}')):

                # Optimizer setting
                optimizer.zero_grad(set_to_none=True)

                # Input, output setting
                src = src.to(device, non_blocking=True)
                trg = trg.to(device, non_blocking=True)

                # Targets are the sequence shifted left by one token; padding
                # positions are masked out before the loss.
                trg_sequences_target = trg[:, 1:]
                non_pad = trg_sequences_target != args.pad_id
                trg_sequences_target = trg_sequences_target[
                    non_pad].contiguous().view(-1)

                # Train
                if phase == 'train':

                    # Loss calculate (mixed precision forward pass)
                    with autocast():
                        predicted = model(src,
                                          trg[:, :-1],
                                          tgt_mask,
                                          non_pad_position=non_pad)
                        predicted = predicted.view(-1, predicted.size(-1))
                        loss = label_smoothing_loss(predicted,
                                                    trg_sequences_target,
                                                    args.pad_id)

                    # Unscale before clipping so the norm threshold applies
                    # to the true gradients.
                    scaler.scale(loss).backward()
                    scaler.unscale_(optimizer)
                    clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()

                    if args.scheduler in ['constant', 'warmup']:
                        scheduler.step()
                    if args.scheduler == 'reduce_train':
                        scheduler.step(loss)

                    # Print loss value only during training.
                    # FIX: last-batch check is len - 1; ``i`` never reaches
                    # len(dataloader), so the old condition never fired.
                    if i == 0 or freq == args.print_freq or i == len(
                            dataloader_dict['train']) - 1:
                        acc = (predicted.max(dim=1)[1] == trg_sequences_target
                               ).sum() / len(trg_sequences_target)
                        iter_log = "[Epoch:%03d][%03d/%03d] train_loss:%03.3f | train_acc:%03.2f%% | learning_rate:%1.6f | spend_time:%02.2fmin" % \
                            (epoch, i, len(dataloader_dict['train']),
                            loss.item(), acc*100, optimizer.param_groups[0]['lr'],
                            (time() - start_time_e) / 60)
                        write_log(logger, iter_log)
                        freq = 0
                    freq += 1

                # Validation
                if phase == 'valid':
                    with torch.no_grad():
                        predicted = model(src,
                                          trg[:, :-1],
                                          tgt_mask,
                                          non_pad_position=non_pad)
                        loss = F.cross_entropy(predicted, trg_sequences_target)
                    val_loss += loss.item()
                    # .item() keeps the running accuracy a Python float.
                    val_acc += ((predicted.max(dim=1)[1] ==
                                 trg_sequences_target).sum() /
                                len(trg_sequences_target)).item()

            if phase == 'valid':
                val_loss /= len(dataloader_dict[phase])
                val_acc /= len(dataloader_dict[phase])
                # FIX: validation-driven schedulers now step ONCE per epoch on
                # the epoch-mean loss; previously they stepped on every batch
                # with a running *sum*, which distorted ReduceLROnPlateau.
                if args.scheduler == 'reduce_valid':
                    scheduler.step(val_loss)
                if args.scheduler == 'lambda':
                    scheduler.step()
                write_log(logger, 'Validation Loss: %3.3f' % val_loss)
                write_log(logger,
                          'Validation Accuracy: %3.2f%%' % (val_acc * 100))
                if val_acc > best_val_acc:
                    write_log(logger, 'Checkpoint saving...')
                    # FIX: save to the same path the resume branch reads from;
                    # previously the file was written to the CWD under a
                    # different name and could never be resumed.
                    torch.save(
                        {
                            'epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'scaler': scaler.state_dict()
                        }, os.path.join(args.save_path, 'checkpoint.pth.tar'))
                    best_val_acc = val_acc
                    best_epoch = epoch
                else:
                    else_log = f'Still {best_epoch} epoch accuracy({round(best_val_acc*100, 2)})% is better...'
                    write_log(logger, else_log)

    # 3) Print results
    print(f'Best Epoch: {best_epoch}')
    print(f'Best Accuracy: {round(best_val_acc, 2)}')
# Example No. 2 (score: 0)
def main(proc_id, args):
    """Worker-process beam-search translation loop fed by an HTTP job API.

    Repeatedly pulls batches of hanja sentences from ``args.api_url``,
    decodes them with beam search on GPU ``proc_id`` using a pruned
    Transformer checkpoint, attaches the predicted sequence to each record,
    and posts the results back until the server reports ``'finish'``.

    Args:
        proc_id: CUDA device index (and log tag) for this worker.
        args: namespace with API url, SentencePiece/checkpoint paths, model
            hyper-parameters, and beam-search settings (``beam_size``,
            ``beam_alpha``, ``repetition_penalty``).
    """
    trg_sp = spm.SentencePieceProcessor()
    trg_sp.Load(args.spm_trg_path)
    trg_vocab_num = trg_sp.piece_size()
    bos_id = trg_sp.bos_id()
    eos_id = trg_sp.eos_id()
    pad_id = trg_sp.pad_id()
    # Source vocabulary is served by the API; anything unseen maps to <unk>.
    src_vocab = requests.get(f'{args.api_url}/getMetaData').json()['src_vocab']
    unk_id = src_vocab['<unk>']

    device = torch.device(f"cuda:{proc_id}")
    model = Transformer(len(src_vocab),
                        trg_vocab_num,
                        pad_idx=pad_id,
                        bos_idx=bos_id,
                        eos_idx=eos_id,
                        src_max_len=args.src_max_len,
                        trg_max_len=args.trg_max_len,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        num_decoder_layer=args.num_decoder_layer,
                        num_mask_layer=args.num_mask_layer)

    model.load_state_dict(
        torch.load(args.checkpoint_path, map_location=device)['model'])
    # Inference only needs the trg decoding head; drop the src-side heads and
    # mask encoders to free GPU memory.
    model.src_output_linear = None
    model.src_output_linear2 = None
    model.src_output_norm = None
    model.mask_encoders = None
    model = model.to(device)
    model = model.eval()

    # Pre-build one causal mask per possible decoded length.
    tgt_masks = {
        l: model.generate_square_subsequent_mask(l, device)
        for l in range(1, args.trg_max_len + 1)
    }

    while True:
        data = requests.get(f'{args.api_url}/getData').json()
        pred_data = {'file': data['file'], 'content': []}
        parsed_ids = []
        for d in data['content']:
            parsed_id = [src_vocab.get(c, unk_id) for c in d['hanja']]
            # Keep only sentences within the model's supported length range,
            # right-padded with zeros to src_max_len.
            if args.min_len <= len(parsed_id) <= args.src_max_len:
                input_id = np.zeros(args.src_max_len, dtype=np.int64)
                input_id[:len(parsed_id)] = parsed_id
                parsed_ids.append(input_id)
                pred_data['content'].append(d)

        num_iter = ceil(len(parsed_ids) / args.batch_size)
        batch_size_ = args.batch_size
        predicted_num = 0

        with torch.no_grad():
            # Index of beam 0 of every batch element in the flattened
            # (batch * beam) dimension.
            batch_indices = torch.arange(0,
                                         args.beam_size * args.batch_size,
                                         args.beam_size,
                                         device=device)
            for iter_ in range(num_iter):
                iter_time = time()
                src_sequences = parsed_ids[iter_ *
                                           args.batch_size:(iter_ + 1) *
                                           args.batch_size]

                scores_save = torch.zeros(args.beam_size * args.batch_size,
                                          1,
                                          device=device)
                top_k_scores = torch.zeros(args.beam_size * args.batch_size,
                                           1,
                                           device=device)
                complete_seqs = dict()
                complete_ind = set()
                # Last batch may be short: rebuild the beam bookkeeping
                # tensors for the smaller batch size.
                if len(src_sequences) < args.batch_size:
                    batch_size_ = len(src_sequences)
                    batch_indices = torch.arange(0,
                                                 args.beam_size * batch_size_,
                                                 args.beam_size,
                                                 device=device)
                    scores_save = torch.zeros(args.beam_size * batch_size_,
                                              1,
                                              device=device)
                    top_k_scores = torch.zeros(args.beam_size * batch_size_,
                                               1,
                                               device=device)

                # FIX: build the batch with one host->device transfer via
                # torch.as_tensor instead of the deprecated legacy
                # torch.cuda.LongTensor constructor called per sequence.
                src_sequences = torch.as_tensor(np.array(src_sequences),
                                                dtype=torch.long,
                                                device=device)
                src_sequences = src_sequences.view(batch_size_,
                                                   args.src_max_len)

                # Encoding
                # encoder_out: (src_seq, batch_size, d_model), src_key_padding_mask: (batch_size, src_seq)
                encoder_out = model.src_embedding(src_sequences).transpose(
                    0, 1)
                src_key_padding_mask = (src_sequences == pad_id)
                for encoder in model.encoders:
                    encoder_out = encoder(
                        encoder_out, src_key_padding_mask=src_key_padding_mask)

                # Expanding: replicate encoder state once per beam.
                # encoder_out: (src_seq, batch_size*k, d_model), src_key_padding_mask: (batch_size*k, src_seq)
                src_seq_size = encoder_out.size(0)
                src_key_padding_mask = src_key_padding_mask.view(
                    batch_size_, 1, -1).repeat(1, args.beam_size, 1)
                src_key_padding_mask = src_key_padding_mask.view(
                    -1, src_seq_size)
                encoder_out = encoder_out.view(-1, batch_size_, 1,
                                               args.d_model).repeat(
                                                   1, 1, args.beam_size, 1)
                encoder_out = encoder_out.view(src_seq_size, -1, args.d_model)

                # Decoding start token setting
                seqs = torch.tensor([[bos_id]],
                                    dtype=torch.long,
                                    device=device)
                seqs = seqs.repeat(args.beam_size * batch_size_,
                                   1).contiguous()

                for step in range(model.trg_max_len):
                    # Decoder setting
                    # tgt_mask: (out_seq), tgt_key_padding_mask: (batch_size * k, out_seq)
                    tgt_mask = tgt_masks[seqs.size(1)]
                    tgt_key_padding_mask = (seqs == pad_id)

                    # Decoding sentence
                    # decoder_out: (out_seq, batch_size * k, d_model)
                    decoder_out = model.trg_embedding(seqs).transpose(0, 1)
                    for decoder in model.decoders:
                        decoder_out = decoder(
                            decoder_out,
                            encoder_out,
                            tgt_mask=tgt_mask,
                            memory_key_padding_mask=src_key_padding_mask,
                            tgt_key_padding_mask=tgt_key_padding_mask)

                    # Score calculate (only the newest position is needed)
                    # scores: (batch_size * k, vocab_num)
                    scores = F.gelu(model.trg_output_linear(decoder_out[-1]))
                    scores = model.trg_output_linear2(
                        model.trg_output_norm(scores))
                    scores = F.log_softmax(scores, dim=1)

                    # Repetition Penalty: scale down (log-probs are negative,
                    # so multiplying by penalty > 1 lowers the score of the
                    # token emitted at the previous step.
                    if step > 0 and args.repetition_penalty > 0:
                        prev_ix = next_word_inds.view(-1)
                        for index, prev_token_id in enumerate(prev_ix):
                            scores[index][
                                prev_token_id] *= args.repetition_penalty

                    # Add cumulative beam score
                    scores = top_k_scores.expand_as(scores) + scores
                    if step == 0:
                        # All k beams of one sentence are identical at step 0,
                        # so keep only one row per sentence.
                        # scores: (batch_size, vocab_num)
                        # top_k_scores: (batch_size, k)
                        scores = scores[::args.beam_size]
                        # set eos token probability zero in first step
                        scores[:, eos_id] = float('-inf')
                        top_k_scores, top_k_words = scores.topk(
                            args.beam_size, 1, True, True)
                    else:
                        # top_k_scores: (batch_size * k, out_seq)
                        top_k_scores, top_k_words = scores.view(
                            batch_size_, -1).topk(args.beam_size, 1, True,
                                                  True)

                    # Previous and Next word extract
                    # seqs: (batch_size * k, out_seq + 1)
                    prev_word_inds = top_k_words // trg_vocab_num
                    next_word_inds = top_k_words % trg_vocab_num
                    top_k_scores = top_k_scores.view(
                        batch_size_ * args.beam_size, -1)
                    top_k_words = top_k_words.view(
                        batch_size_ * args.beam_size, -1)
                    # Reorder beam histories to follow the surviving beams.
                    seqs = seqs[prev_word_inds.view(-1) +
                                batch_indices.unsqueeze(1).repeat(
                                    1, args.beam_size).view(-1)]
                    seqs = torch.cat([
                        seqs,
                        next_word_inds.view(args.beam_size * batch_size_, -1)
                    ],
                                     dim=1)

                    # Find and Save Complete Sequences Score
                    eos_ind = torch.where(next_word_inds.view(-1) == eos_id)[0]
                    if len(eos_ind) > 0:
                        eos_ind = eos_ind.tolist()
                        complete_ind_add = set(eos_ind) - complete_ind
                        complete_ind_add = list(complete_ind_add)
                        complete_ind.update(eos_ind)
                        if len(complete_ind_add) > 0:
                            scores_save[complete_ind_add] = top_k_scores[
                                complete_ind_add]
                            for ix in complete_ind_add:
                                complete_seqs[ix] = seqs[ix].tolist()

                # If eos token doesn't exist in sequence, fall back to the
                # full-length hypothesis and its final score.
                score_save_pos = torch.where(scores_save == 0)
                if len(score_save_pos[0]) > 0:
                    for ix in score_save_pos[0].tolist():
                        complete_seqs[ix] = seqs[ix].tolist()
                    scores_save[score_save_pos] = top_k_scores[score_save_pos]

                # Beam Length Normalization (GNMT-style length penalty)
                lp = torch.tensor([
                    len(complete_seqs[i])
                    for i in range(batch_size_ * args.beam_size)
                ],
                                  device=device)
                lp = (((lp + args.beam_size)**args.beam_alpha) /
                      ((args.beam_size + 1)**args.beam_alpha))
                scores_save = scores_save / lp.unsqueeze(1)

                # Predicted and Label processing: pick the best beam per
                # sentence and decode it back to text.
                ind = scores_save.view(batch_size_, args.beam_size,
                                       -1).argmax(dim=1)
                ind = (ind.view(-1) + batch_indices).tolist()
                for i in ind:
                    predicted_sequence = trg_sp.decode_ids(complete_seqs[i])
                    pred_data['content'][predicted_num][
                        'predicted_sequence'] = predicted_sequence
                    predicted_num += 1

                iter_time = time() - iter_time
                print(
                    f"{proc_id} - iter: {iter_ + 1}/{num_iter}, {iter_time:.2f}"
                )

        res = requests.post(f'{args.api_url}/commitData',
                            json=pred_data).json()
        print(f"{proc_id} - Progress: {res['progress']}, {pred_data['file']}")
        if res['progress'] == 'finish':
            return
def main(args):
    """Evaluate a trained hanja->korean Transformer with beam search.

    Loads the preprocessed test set and a pruned checkpoint, decodes each
    batch with beam search (beam size ``args.beam_size``, length-normalised
    by ``args.beam_alpha``), and pickles predictions + labels together with
    their SentencePiece decodings.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    handler = TqdmLoggingHandler()
    handler.setFormatter(logging.Formatter(" %(asctime)s - %(message)s"))
    logger.addHandler(handler)
    logger.propagate = False

    write_log(logger, "Load data")

    def load_data(args):
        # Read vocab dicts and the preprocessed test indices from pickles.
        # gc is disabled during unpickling to speed up loading large objects.
        gc.disable()
        with open(f"{args.preprocessed_data_path}/hanja_korean_word2id.pkl",
                  "rb") as f:
            data = pickle.load(f)
            hanja_word2id = data['hanja_word2id']
            korean_word2id = data['korean_word2id']

        with open(f"{args.preprocessed_data_path}/preprocessed_test.pkl",
                  "rb") as f:
            data = pickle.load(f)
            test_hanja_indices = data['hanja_indices']
            test_korean_indices = data['korean_indices']

        gc.enable()
        write_log(logger, "Finished loading data!")
        return hanja_word2id, korean_word2id, test_hanja_indices, test_korean_indices

    hanja_word2id, korean_word2id, test_hanja_indices, test_korean_indices = load_data(
        args)
    hanja_vocab_num = len(hanja_word2id)
    korean_vocab_num = len(korean_word2id)

    hk_dataset = HanjaKoreanDataset(test_hanja_indices,
                                    test_korean_indices,
                                    min_len=args.min_len,
                                    src_max_len=args.src_max_len,
                                    trg_max_len=args.trg_max_len)
    # drop_last=True: the beam bookkeeping below assumes every batch has
    # exactly hk_batch_size elements, so a short tail batch is dropped.
    hk_loader = DataLoader(hk_dataset,
                           drop_last=True,
                           batch_size=args.hk_batch_size,
                           num_workers=4,
                           prefetch_factor=4,
                           pin_memory=True)
    write_log(logger, f"hanja-korean: {len(hk_dataset)}, {len(hk_loader)}")
    del test_hanja_indices, test_korean_indices

    write_log(logger, "Build model")
    model = Transformer(hanja_vocab_num,
                        korean_vocab_num,
                        pad_idx=args.pad_idx,
                        bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx,
                        src_max_len=args.src_max_len,
                        trg_max_len=args.trg_max_len,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        num_decoder_layer=args.num_decoder_layer,
                        num_mask_layer=args.num_mask_layer)

    model.load_state_dict(
        torch.load(args.checkpoint_path, map_location=device)['model'])
    # Inference needs only the trg decoding head; drop src-side heads and
    # mask encoders to free GPU memory.
    model.src_output_linear = None
    model.src_output_linear2 = None
    model.src_output_norm = None
    model.mask_encoders = None
    model = model.to(device)
    model.eval()

    write_log(logger, "Load SentencePiece model")
    parser = spm.SentencePieceProcessor()
    parser.Load(os.path.join(args.preprocessed_data_path, 'm_korean.model'))

    predicted_list = list()
    label_list = list()
    # Index of beam 0 of each batch element in the flattened (batch * beam)
    # dimension.
    every_batch = torch.arange(0,
                               args.beam_size * args.hk_batch_size,
                               args.beam_size,
                               device=device)
    # One pre-built causal mask per possible decoded length.
    tgt_masks = {
        l: model.generate_square_subsequent_mask(l, device)
        for l in range(1, args.trg_max_len + 1)
    }

    with torch.no_grad():
        for src_sequences, trg_sequences in tqdm(hk_loader):
            src_sequences = src_sequences.to(device)
            label_list.extend(trg_sequences.tolist())

            # Encoding
            # encoder_out: (src_seq, batch_size, d_model)
            # src_key_padding_mask: (batch_size, src_seq)
            encoder_out = model.src_embedding(src_sequences).transpose(0, 1)
            src_key_padding_mask = (src_sequences == model.pad_idx)
            for encoder in model.encoders:
                encoder_out = encoder(
                    encoder_out, src_key_padding_mask=src_key_padding_mask)

            # Expanding: replicate encoder state once per beam
            # encoder_out: (src_seq, batch_size * k, d_model)
            # src_key_padding_mask: (batch_size * k, src_seq)
            src_seq_size = encoder_out.size(0)
            src_key_padding_mask = src_key_padding_mask.view(
                args.hk_batch_size, 1, -1).repeat(1, args.beam_size, 1)
            src_key_padding_mask = src_key_padding_mask.view(-1, src_seq_size)
            encoder_out = encoder_out.view(-1, args.hk_batch_size, 1,
                                           args.d_model).repeat(
                                               1, 1, args.beam_size, 1)
            encoder_out = encoder_out.view(src_seq_size, -1, args.d_model)

            # Scores save vector & decoding list setting
            scores_save = torch.zeros(args.beam_size * args.hk_batch_size,
                                      1,
                                      device=device)
            top_k_scores = torch.zeros(args.beam_size * args.hk_batch_size,
                                       1,
                                       device=device)
            complete_seqs = dict()
            complete_ind = set()

            # Decoding start token setting: every beam begins with <bos>.
            seqs = torch.tensor([[model.bos_idx]],
                                dtype=torch.long,
                                device=device)
            seqs = seqs.repeat(args.beam_size * args.hk_batch_size,
                               1).contiguous()

            for step in range(model.trg_max_len):
                # Decoder setting
                # tgt_mask: (out_seq)
                # tgt_key_padding_mask: (batch_size * k, out_seq)
                tgt_mask = tgt_masks[seqs.size(1)]
                tgt_key_padding_mask = (seqs == model.pad_idx)

                # Decoding sentence
                # decoder_out: (out_seq, batch_size * k, d_model)
                decoder_out = model.trg_embedding(seqs).transpose(0, 1)
                for decoder in model.decoders:
                    decoder_out = decoder(
                        decoder_out,
                        encoder_out,
                        tgt_mask=tgt_mask,
                        memory_key_padding_mask=src_key_padding_mask,
                        tgt_key_padding_mask=tgt_key_padding_mask)

                # Score calculate (only the newest position is needed)
                # scores: (batch_size * k, vocab_num)
                scores = F.gelu(model.trg_output_linear(decoder_out[-1]))
                scores = model.trg_output_linear2(
                    model.trg_output_norm(scores))
                scores = F.log_softmax(scores, dim=1)

                # Repetition Penalty: log-probs are negative, so multiplying
                # by a penalty > 1 lowers the score of the previous token.
                if step > 0 and args.repetition_penalty > 0:
                    prev_ix = next_word_inds.view(-1)
                    for index, prev_token_id in enumerate(prev_ix):
                        scores[index][prev_token_id] *= args.repetition_penalty

                # Add cumulative beam score
                scores = top_k_scores.expand_as(scores) + scores
                if step == 0:
                    # All k beams of a sentence are identical at step 0, so
                    # keep only one row per sentence before the topk.
                    # scores: (batch_size, vocab_num)
                    # top_k_scores: (batch_size, k)
                    scores = scores[::args.beam_size]
                    scores[:, model.eos_idx] = float(
                        '-inf')  # set eos token probability zero in first step
                    top_k_scores, top_k_words = scores.topk(
                        args.beam_size, 1, True, True)
                else:
                    # top_k_scores: (batch_size * k, out_seq)
                    top_k_scores, top_k_words = scores.view(
                        args.hk_batch_size, -1).topk(args.beam_size, 1, True,
                                                     True)

                # Previous and Next word extract
                # seqs: (batch_size * k, out_seq + 1)
                prev_word_inds = top_k_words // korean_vocab_num
                next_word_inds = top_k_words % korean_vocab_num
                top_k_scores = top_k_scores.view(
                    args.hk_batch_size * args.beam_size, -1)
                top_k_words = top_k_words.view(
                    args.hk_batch_size * args.beam_size, -1)
                # Reorder beam histories to follow the surviving beams.
                seqs = seqs[prev_word_inds.view(-1) + every_batch.unsqueeze(
                    1).repeat(1, args.beam_size).view(-1)]
                seqs = torch.cat([
                    seqs,
                    next_word_inds.view(args.beam_size * args.hk_batch_size,
                                        -1)
                ],
                                 dim=1)

                # Find and Save Complete Sequences Score
                eos_ind = torch.where(
                    next_word_inds.view(-1) == model.eos_idx)[0]
                if len(eos_ind) > 0:
                    eos_ind = eos_ind.tolist()
                    complete_ind_add = set(eos_ind) - complete_ind
                    complete_ind_add = list(complete_ind_add)
                    complete_ind.update(eos_ind)
                    if len(complete_ind_add) > 0:
                        scores_save[complete_ind_add] = top_k_scores[
                            complete_ind_add]
                        for ix in complete_ind_add:
                            complete_seqs[ix] = seqs[ix].tolist()

            # If eos token doesn't exist in sequence, fall back to the
            # full-length hypothesis and its final score.
            score_save_pos = torch.where(scores_save == 0)
            if len(score_save_pos[0]) > 0:
                for ix in score_save_pos[0].tolist():
                    complete_seqs[ix] = seqs[ix].tolist()
                scores_save[score_save_pos] = top_k_scores[score_save_pos]

            # Beam Length Normalization (GNMT-style length penalty)
            lp = torch.tensor([
                len(complete_seqs[i])
                for i in range(args.hk_batch_size * args.beam_size)
            ],
                              device=device)
            lp = (((lp + args.beam_size)**args.beam_alpha) /
                  ((args.beam_size + 1)**args.beam_alpha))
            scores_save = scores_save / lp.unsqueeze(1)

            # Predicted and Label processing: keep the best beam per sentence.
            ind = scores_save.view(args.hk_batch_size, args.beam_size,
                                   -1).argmax(dim=1)
            ind_expand = ind.view(-1) + every_batch
            predicted_list.extend(
                [complete_seqs[i] for i in ind_expand.tolist()])

    with open(
            f'./results_beam_{args.beam_size}_{args.beam_alpha}_{args.repetition_penalty}.pkl',
            'wb') as f:
        pickle.dump(
            {
                'prediction':
                predicted_list,
                'label':
                label_list,
                'prediction_decode':
                [parser.DecodeIds(pred) for pred in predicted_list],
                'label_decode':
                [parser.DecodeIds(label) for label in label_list]
            }, f)
# Example No. 4 (score: 0)
def main(args):
    """Distributed training entry point for a hanja->korean Transformer.

    Discovers world size / rank via MPI, bootstraps a NCCL process group,
    then trains a Transformer with up to two alternating objectives per step:
      * masked hanja reconstruction (enabled by args.train_reconstruct)
      * hanja -> korean translation (enabled by args.train_translate)
    using mixed precision (autocast + GradScaler) and gradient accumulation.
    Rank 0 owns console logging, TensorBoard summaries, and checkpointing.
    """
    # --- Distributed setup: MPI supplies rank/world size; torch.distributed
    # rendezvous happens over TCP on localhost at args.master_port.
    comm = MPI.COMM_WORLD
    world_size = comm.Get_size()
    rank = comm.Get_rank()
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = str(args.master_port)
    # NOTE(review): assumes one GPU per rank on a single node (GPU index == rank)
    # — confirm for multi-node launches.
    torch.cuda.set_device(rank)
    dist.init_process_group(backend="nccl", world_size=world_size, rank=rank)
    device = torch.device("cuda")

    # Only rank 0 logs and writes TensorBoard; other ranks keep these as None
    # (write_log is presumably a no-op when logger is None — verify helper).
    logger = None
    tb_logger = None
    if rank == 0:
        if not os.path.exists(args.save_path):
            os.mkdir(args.save_path)
        if not os.path.exists(args.tensorboard_log_dir):
            os.mkdir(args.tensorboard_log_dir)
        tb_logger = SummaryWriter(
            f"{args.tensorboard_log_dir}/{args.model_name}")

        logger = logging.getLogger(__name__)
        logger.setLevel(logging.DEBUG)
        handler = TqdmLoggingHandler()
        handler.setFormatter(logging.Formatter(" %(asctime)s - %(message)s"))
        logger.addHandler(handler)
        logger.propagate = False

    write_log(logger, "Load data")

    def load_data(args):
        """Load vocab dicts and pre-tokenized train/valid index lists from pickles.

        Returns an 8-tuple: (hanja_word2id, korean_word2id, train hanja/korean/
        additional-hanja indices, valid hanja/korean/additional-hanja indices).
        """
        # GC is disabled around pickle.load to speed up deserialization of
        # large object graphs, then re-enabled afterwards.
        gc.disable()
        with open(f"{args.preprocessed_data_path}/hanja_korean_word2id.pkl",
                  "rb") as f:
            data = pickle.load(f)
            hanja_word2id = data['hanja_word2id']
            korean_word2id = data['korean_word2id']

        with open(f"{args.preprocessed_data_path}/preprocessed_train.pkl",
                  "rb") as f:
            data = pickle.load(f)
            train_hanja_indices = data['hanja_indices']
            train_korean_indices = data['korean_indices']
            train_additional_hanja_indices = data['additional_hanja_indices']

        with open(f"{args.preprocessed_data_path}/preprocessed_valid.pkl",
                  "rb") as f:
            data = pickle.load(f)
            valid_hanja_indices = data['hanja_indices']
            valid_korean_indices = data['korean_indices']
            valid_additional_hanja_indices = data['additional_hanja_indices']

        gc.enable()
        write_log(logger, "Finished loading data!")
        return (hanja_word2id, korean_word2id, train_hanja_indices,
                train_korean_indices, train_additional_hanja_indices,
                valid_hanja_indices, valid_korean_indices,
                valid_additional_hanja_indices)

    # load data
    (hanja_word2id, korean_word2id, train_hanja_indices, train_korean_indices,
     train_additional_hanja_indices, valid_hanja_indices, valid_korean_indices,
     valid_additional_hanja_indices) = load_data(args)
    hanja_vocab_num = len(hanja_word2id)
    korean_vocab_num = len(korean_word2id)

    # --- Dataloaders. Each dataset gets a DistributedSampler so every rank
    # sees a disjoint shard; drop_last keeps per-rank batch counts equal.
    # Paired hanja-korean data drives the translation objective.
    hk_dataset = HanjaKoreanDataset(train_hanja_indices,
                                    train_korean_indices,
                                    min_len=args.min_len,
                                    src_max_len=args.src_max_len,
                                    trg_max_len=args.trg_max_len)
    hk_sampler = DistributedSampler(hk_dataset,
                                    num_replicas=world_size,
                                    rank=rank)
    hk_loader = DataLoader(hk_dataset,
                           drop_last=True,
                           batch_size=args.hk_batch_size,
                           sampler=hk_sampler,
                           num_workers=args.num_workers,
                           prefetch_factor=4,
                           pin_memory=True)
    write_log(logger, f"hanja-korean: {len(hk_dataset)}, {len(hk_loader)}")

    # Hanja-only data (plus additional monolingual hanja) drives the masked
    # reconstruction objective.
    h_dataset = HanjaDataset(train_hanja_indices,
                             train_additional_hanja_indices,
                             hanja_word2id,
                             min_len=args.min_len,
                             src_max_len=args.src_max_len)
    h_sampler = DistributedSampler(h_dataset,
                                   num_replicas=world_size,
                                   rank=rank)
    h_loader = DataLoader(h_dataset,
                          drop_last=True,
                          batch_size=args.h_batch_size,
                          sampler=h_sampler,
                          num_workers=args.num_workers,
                          prefetch_factor=4,
                          pin_memory=True)
    write_log(logger, f"hanja: {len(h_dataset)}, {len(h_loader)}")

    hk_valid_dataset = HanjaKoreanDataset(valid_hanja_indices,
                                          valid_korean_indices,
                                          min_len=args.min_len,
                                          src_max_len=args.src_max_len,
                                          trg_max_len=args.trg_max_len)
    hk_valid_sampler = DistributedSampler(hk_valid_dataset,
                                          num_replicas=world_size,
                                          rank=rank)
    hk_valid_loader = DataLoader(hk_valid_dataset,
                                 drop_last=True,
                                 batch_size=args.hk_batch_size,
                                 sampler=hk_valid_sampler)
    write_log(
        logger,
        f"hanja-korean-valid: {len(hk_valid_dataset)}, {len(hk_valid_loader)}")

    h_valid_dataset = HanjaDataset(valid_hanja_indices,
                                   valid_additional_hanja_indices,
                                   hanja_word2id,
                                   min_len=args.min_len,
                                   src_max_len=args.src_max_len)
    h_valid_sampler = DistributedSampler(h_valid_dataset,
                                         num_replicas=world_size,
                                         rank=rank)
    h_valid_loader = DataLoader(h_valid_dataset,
                                drop_last=True,
                                batch_size=args.h_batch_size,
                                sampler=h_valid_sampler)
    write_log(logger, f"hanja: {len(h_valid_dataset)}, {len(h_valid_loader)}")

    # Free the raw index lists; the datasets hold their own references.
    del (train_hanja_indices, train_korean_indices,
         train_additional_hanja_indices, valid_hanja_indices,
         valid_korean_indices, valid_additional_hanja_indices)

    write_log(logger, "Build model")
    model = Transformer(hanja_vocab_num,
                        korean_vocab_num,
                        pad_idx=args.pad_idx,
                        bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx,
                        src_max_len=args.src_max_len,
                        trg_max_len=args.trg_max_len,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        dropout=args.dropout,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        num_decoder_layer=args.num_decoder_layer,
                        num_mask_layer=args.num_mask_layer).to(device)
    # find_unused_parameters=True: only one of the two heads (translate /
    # reconstruct) participates in a given forward pass.
    # NOTE(review): device_ids=[device] passes a torch.device object; DDP
    # conventionally takes an int GPU index here — confirm against the pinned
    # torch version.
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[device],
                                                find_unused_parameters=True)
    # Explicitly sync initial weights from rank 0 so all replicas start
    # identical (DDP construction is generally said to do this too — this is a
    # belt-and-braces sync; verify if it can be dropped).
    for param in model.parameters():
        dist.broadcast(param.data, 0)

    dist.barrier()
    write_log(
        logger,
        f"Total Parameters: {sum([p.nelement() for p in model.parameters()])}")

    # Standard two-group weight decay: biases and LayerNorm params are exempt.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = Ralamb(params=optimizer_grouped_parameters, lr=args.lr)

    # One optimizer step consumes num_grad_accumulate batches, so total
    # optimizer steps = batches / accumulation * epochs.
    total_iters = round(
        len(hk_loader) / args.num_grad_accumulate * args.epochs)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, round(total_iters * args.warmup_ratio), total_iters)
    scaler = GradScaler()

    # --- Optional resume: restore model/optimizer/scheduler/scaler state and
    # continue from the epoch after the checkpointed one.
    start_epoch = 0
    if args.resume:

        def load_states():
            """Restore training state from the latest checkpoint; return next epoch."""
            checkpoint = torch.load(
                f'{args.save_path}/{args.model_name}_ckpt.pt',
                map_location='cpu')
            start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            scaler.load_state_dict(checkpoint['scaler'])
            return start_epoch

        start_epoch = load_states()

    write_log(logger, f"Training start - Total iter: {total_iters}\n")
    iter_num = round(len(hk_loader) / args.num_grad_accumulate)
    global_step = start_epoch * iter_num
    # Loaders are consumed through persistent iterators that get recycled on
    # StopIteration; epoch boundaries are tracked manually (see finish_epoch).
    hk_iter = iter(hk_loader)
    h_iter = iter(h_loader)
    model.train()
    # Causal mask built once: the decoder input is trg[:, :-1], hence len-1.
    tgt_mask = Transformer.generate_square_subsequent_mask(
        args.trg_max_len - 1, device)

    # validation (baseline pass before any training, logged as epoch 0)
    validate(model, tgt_mask, h_valid_loader, hk_valid_loader, rank, logger,
             tb_logger, 0, device)

    for epoch in range(start_epoch + 1, args.epochs + 1):
        # Inner loop runs until the translation loader wraps around (one
        # "epoch" is defined by exhausting hk_loader, not h_loader).
        while True:
            start = time.time()
            finish_epoch = False
            trans_top5, trans_loss, mask_top5, mask_loss = 0.0, 0.0, 0.0, 0.0

            # --- Objective 1: masked hanja reconstruction ---
            if args.train_reconstruct:
                optimizer.zero_grad(set_to_none=True)
                for _ in range(args.num_grad_accumulate):
                    try:
                        src_sequences, trg_sequences = next(h_iter)
                    except StopIteration:
                        # Loader exhausted: reshuffle shard and restart it.
                        h_sampler.set_epoch(epoch)
                        h_iter = iter(h_loader)
                        src_sequences, trg_sequences = next(h_iter)

                    trg_sequences = trg_sequences.to(device)
                    src_sequences = src_sequences.to(device)
                    # Loss is computed only over non-pad target positions.
                    non_pad = trg_sequences != args.pad_idx
                    trg_sequences = trg_sequences[non_pad].contiguous().view(
                        -1)

                    with autocast():
                        # Calls model.module directly, bypassing the DDP
                        # wrapper — so DDP's automatic gradient sync does NOT
                        # run for this pass; the manual all_reduce below is
                        # what synchronizes these gradients.
                        predicted = model.module.reconstruct_predict(
                            src_sequences, masked_position=non_pad)
                        predicted = predicted.view(-1, predicted.size(-1))
                        # Divide by accumulation count so accumulated grads
                        # average rather than sum over micro-batches.
                        loss = label_smoothing_loss(
                            predicted,
                            trg_sequences) / args.num_grad_accumulate

                    scaler.scale(loss).backward()

                    # Metrics are only accumulated on steps that will be
                    # printed, so logged values reflect just that step.
                    if global_step % args.print_freq == 0:
                        mask_top5 += accuracy(predicted, trg_sequences,
                                              5) / args.num_grad_accumulate
                        mask_loss += loss.detach().item()

                # Manual gradient averaging across ranks (required here since
                # the forward bypassed DDP; operates on still-scaled grads,
                # which is fine since scaling is uniform).
                for param in model.parameters():
                    if param.grad is not None:
                        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
                        param.grad.data = param.grad.data / world_size

                scaler.step(optimizer)
                scaler.update()

            # --- Objective 2: hanja -> korean translation ---
            if args.train_translate:
                optimizer.zero_grad(set_to_none=True)
                for _ in range(args.num_grad_accumulate):
                    try:
                        src_sequences, trg_sequences = next(hk_iter)
                    except StopIteration:
                        hk_sampler.set_epoch(epoch)
                        hk_iter = iter(hk_loader)
                        src_sequences, trg_sequences = next(hk_iter)
                        # hk_loader wrap-around defines the epoch boundary.
                        finish_epoch = True

                    trg_sequences = trg_sequences.to(device)
                    # Teacher forcing: input is trg[:, :-1], target trg[:, 1:].
                    trg_sequences_target = trg_sequences[:, 1:]
                    src_sequences = src_sequences.to(device)
                    non_pad = trg_sequences_target != args.pad_idx
                    trg_sequences_target = trg_sequences_target[
                        non_pad].contiguous().view(-1)

                    with autocast():
                        predicted = model(src_sequences,
                                          trg_sequences[:, :-1],
                                          tgt_mask,
                                          non_pad_position=non_pad)
                        predicted = predicted.view(-1, predicted.size(-1))
                        loss = label_smoothing_loss(
                            predicted,
                            trg_sequences_target) / args.num_grad_accumulate

                    scaler.scale(loss).backward()

                    if global_step % args.print_freq == 0:
                        trans_top5 += accuracy(predicted, trg_sequences_target,
                                               5) / args.num_grad_accumulate
                        trans_loss += loss.detach().item()

                # NOTE(review): this forward went through the DDP wrapper, so
                # gradients should already be synchronized; summing identical
                # grads and dividing by world_size leaves them unchanged —
                # likely redundant but harmless. Confirm before removing.
                for param in model.parameters():
                    if param.grad is not None:
                        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
                        param.grad.data = param.grad.data / world_size

                scaler.step(optimizer)
                scaler.update()

            # One scheduler step per (possibly dual-objective) optimizer round.
            scheduler.step()

            # Print status
            if global_step % args.print_freq == 0:
                # Average scalar metrics across ranks before logging.
                if args.train_reconstruct:
                    mask_top5 = torch.cuda.FloatTensor([mask_top5])
                    mask_loss = torch.cuda.FloatTensor([mask_loss])
                    dist.all_reduce(mask_top5, op=dist.ReduceOp.SUM)
                    dist.all_reduce(mask_loss, op=dist.ReduceOp.SUM)
                    mask_top5 = (mask_top5 / world_size).item()
                    mask_loss = (mask_loss / world_size).item()

                if args.train_translate:
                    trans_top5 = torch.cuda.FloatTensor([trans_top5])
                    trans_loss = torch.cuda.FloatTensor([trans_loss])
                    dist.all_reduce(trans_top5, op=dist.ReduceOp.SUM)
                    dist.all_reduce(trans_loss, op=dist.ReduceOp.SUM)
                    trans_top5 = (trans_top5 / world_size).item()
                    trans_loss = (trans_loss / world_size).item()

                if rank == 0:
                    batch_time = time.time() - start
                    write_log(
                        logger,
                        f'[{global_step}/{total_iters}, {epoch}]\tIter time: {batch_time:.3f}\t'
                        f'Trans loss: {trans_loss:.3f}\tMask_loss: {mask_loss:.3f}\t'
                        f'Trans@5: {trans_top5:.3f}\tMask@5: {mask_top5:.3f}')

                    tb_logger.add_scalar('loss/translate', trans_loss,
                                         global_step)
                    tb_logger.add_scalar('loss/mask', mask_loss, global_step)
                    tb_logger.add_scalar('top5/translate', trans_top5,
                                         global_step)
                    tb_logger.add_scalar('top5/mask', mask_top5, global_step)
                    tb_logger.add_scalar('batch/time', batch_time, global_step)
                    tb_logger.add_scalar('batch/lr',
                                         optimizer.param_groups[0]['lr'],
                                         global_step)

            global_step += 1
            if finish_epoch:
                break

        # validation
        validate(model, tgt_mask, h_valid_loader, hk_valid_loader, rank,
                 logger, tb_logger, epoch, device)
        # save model (rank 0 only; the unwrapped model.module state is saved
        # so the checkpoint loads without a DDP wrapper)
        if rank == 0:
            torch.save(
                {
                    'epoch': epoch,
                    'model': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'scaler': scaler.state_dict()
                }, f'{args.save_path}/{args.model_name}_ckpt.pt')
            write_log(logger, f"***** {epoch}th model updated! *****")