def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-m", "--model_path", required=False, type=str, help="model of pretrain",default='../output/model_bert/bert_ep2.model')
    parser.add_argument("-v", "--vocab_path", required=False, type=str, help="path of vocab",default='../data/vocab.test')
    args = parser.parse_args()

    model_path = args.model_path
    vocab_path = args.vocab_path

    vocab = WordVocab.load_vocab(vocab_path)

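    # Load the full pickled model object (saved with torch.save(model)) and switch to inference mode.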
    model = torch.load(model_path)
    model.eval()

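    # Example utterance (Chinese): '嗯 不好意思 没有 时间' ≈ "Um, sorry, (I) don't have time",
    # already whitespace-tokenised and split on hp.split_mark.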
    sent = '嗯 不好意思 没有 时间'.split(hp.split_mark)

    text = '嗯 不好意思 没有 时间'
    sent1, label = random_word(text, vocab)
    sent1 = torch.tensor(sent1).long().unsqueeze(0)
    mask_lm_output, attn_list = model.forward(sent1)  # forward pass fills in the attention weights plotted below

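    # Visualise the attention weights of the first 3 encoder layers, 4 heads per layer.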
    for layer in range(3):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Layer", layer+1)
        for h in range(4):
            draw(model.bert.layers[layer].multihead.attention[0, h].data,
                 sent, sent if h == 0 else [], ax=axs[h])
        plt.show()


def train():
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", required=True, type=str, help="train dataset for train bert")
    parser.add_argument("-t", "--valid_dataset", required=True, type=str, help="valid set for evaluate train set")
    parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model")
    parser.add_argument("-m", "--model_path", required=True, type=str, help="Path of exist mlm model")

    parser.add_argument("-w", "--num_workers", type=int, default=1, help="dataloader worker size")
    parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
    parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=[0, 1, 2, 3], help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false")
    parser.add_argument('--mode', type=str, default='train', help="train or eval")
    parser.add_argument('--seed', type=int, default=3431, help="random seed for initialization")

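    # Example invocation (script name and paths are placeholders):
    #   python pretrain.py -c data/train.txt -t data/valid.txt -v data/vocab.pkl \
    #       -o output/ -m output/bert.model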
    args = parser.parse_args()
    set_seed(args)
    paths = Paths(args.output_path)
    mode = args.mode

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", vocab.vocab_size)
    args.char_nums = vocab.vocab_size

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab,  corpus_lines=args.corpus_lines, on_memory=args.on_memory, train=False)

    print("Loading Valid Dataset", args.valid_dataset)
    valid_dataset = BERTDataset(args.valid_dataset, vocab, on_memory=args.on_memory, train=False) \
        if args.valid_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=hp.batch_size,
                                   collate_fn=collate_mlm,
                                   num_workers=args.num_workers, shuffle=True)
    valid_data_loader = DataLoader(valid_dataset, batch_size=hp.batch_size,
                                   collate_fn=collate_mlm,
                                   num_workers=args.num_workers, shuffle=True) \
        if valid_dataset is not None else None

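    # Resume from an existing checkpoint instead of building a new BERT from scratch.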
    print("Load BERT model")
    # bert = BERT(embed_dim=hp.embed_dim, hidden=hp.hidden, args=args)
    bert = torch.load(args.model_path)
    print("Creating BERT Trainer")
    global_step = 0
    trainer = BERTTrainer(bert, vocab.vocab_size, train_dataloader=train_data_loader, test_dataloader=valid_data_loader,
                          with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, args=args, global_step=global_step, path=paths)

    print("Training Start")

    if mode == 'train':
        trainer.train()

    if mode == 'eval':
        trainer.eval()


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-m",
                        "--model_path",
                        required=True,
                        type=str,
                        help="model of pretrain")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="path of vocab")
    args = parser.parse_args()

    model_path = args.model_path
    vocab_path = args.vocab_path

    vocab = WordVocab.load_vocab(vocab_path)

    model = torch.load(model_path)
    model.eval()

    sent = '_I _l _o _v _e _C _h _i _n _a _!'.split()

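    # random_word masks the input (MLM-style) and returns token ids plus labels;
    # the forward pass below yields the masked-LM logits and the attention maps
    # that are plotted afterwards.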
    text = 'I love China!'
    sent1, label = random_word(text, vocab)
    sent1 = torch.tensor(sent1).long().unsqueeze(0)
    mask_lm_output, attn_list = model.forward(sent1)

    chars = []
    for char in sent:
        chars.append(vocab.char2index(char))

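    # Plot attention for the first 3 layers (4 heads each); axis labels are
    # drawn only for the first head of each figure.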
    for layer in range(3):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Layer", layer + 1)
        for h in range(4):
            draw(model.bert.layers[layer].multihead.attention[0, h].data,
                 sent,
                 sent if h == 0 else [],
                 ax=axs[h])
        plt.show()


def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        required=True,
                        type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--test_dataset",
                        type=str,
                        default=None,
                        help="test set for evaluate train set")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="ex)output/bert.model")

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=20,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="number of batch_size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=4,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Loading on memory: true or false")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")

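    # Example invocation (script name and paths are placeholders):
    #   python train.py -c data/corpus.train -t data/corpus.test -v data/vocab.pkl -o output/bert.model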
    args = parser.parse_args()
    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab),
                hidden=args.hidden,
                n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert,
                          len(vocab),
                          train_dataloader=train_data_loader,
                          test_dataloader=test_data_loader,
                          lr=args.lr,
                          betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)


# NOTE: the head of this evaluation script (imports and earlier argument
# definitions) was truncated; the lines below are reconstructed from how
# `args` is used further down. tensor_generate, data_generate and
# test_model_out come from the truncated imports.
import argparse
import csv
import torch
from torch.utils.data import TensorDataset
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from dataset.vocab import WordVocab

parser = argparse.ArgumentParser()
parser.add_argument("--test_corpus", type=str, required=True, help="labelled test corpus (tab-separated).")
parser.add_argument("--vocab", type=str, required=True, help="vocab file path.")
parser.add_argument("--batch_size", type=int, default=64, help="evaluation batch size.")
parser.add_argument("--model_path",
                    default=None,
                    type=str,
                    required=True,
                    help="model path.")

args = parser.parse_args()

test_set = []
with open(args.test_corpus, 'r') as csvfile:
    csv_reader = csv.reader(csvfile,
                            delimiter='\t',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
    for row in csv_reader:
        test_set.append([row[1], int(row[0])])

vocab = WordVocab.load_vocab(args.vocab)

sea_test, _ = tensor_generate(test_set)

seq_list, label_list, target_list = data_generate(test_set)
test_dataset = TensorDataset(seq_list, label_list, sea_test, target_list)

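# Load the fine-tuned classifier and report recall, precision, F1 and accuracy
# on the held-out test set.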
classify_model = torch.load(args.model_path).cuda()
model_out, true_out = test_model_out(test_dataset, classify_model,
                                     args.batch_size)
print('Recall:' + str(recall_score(true_out, model_out)))
print('Precision:' + str(precision_score(true_out, model_out)))
print('F1:' + str(f1_score(true_out, model_out)))
print('Accuracy:' + str(accuracy_score(true_out, model_out)))

import os
import sys

path = os.path.abspath('.')
if path not in sys.path:
    sys.path.append(path)
from dataset.vocab import WordVocab
from utils import *

if __name__ == "__main__":

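    # Build the vocabulary from the raw corpus (min_freq=1 keeps every token)
    # and pickle it for later use.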
    with open("data/corpus.txt", "r", encoding='utf-8') as f:
        vocab = WordVocab(f, min_freq=1)

    to_pkl(vocab, "data/vocab.pkl")
    print("vocab len:", len(vocab))