Example 1
class TestBatchBLEU(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer = BertTokenizer(
            "data/bert-base-multilingual-cased-vocab")

    def test_batch_BLEU(self):
        test_candidates = [
            "The villa was on fire today", "I severely dislike pickles",
            "The quick red fox jumped over the lazy brown dog"
        ]
        test_references = [
            "A house was on fire today", "I hate vinegared cucumbers",
            "A quick red fox hurdled over a sleeping brown dog"
        ]

        seq_len = 100

        # generate samples using known implementation
        batch_candidates = torch.zeros(len(test_candidates),
                                       seq_len,
                                       dtype=torch.long)
        batch_references = torch.zeros_like(batch_candidates)
        candidate_mask = torch.zeros_like(batch_candidates)
        reference_mask = torch.zeros_like(batch_candidates)
        reference_scores = torch.zeros(len(test_candidates))

        for i, (candidate, reference) in enumerate(zip(test_candidates,
                                                       test_references)):
            candidate_ids = self.tokenizer.tokenize_and_convert_to_ids(
                candidate)
            reference_ids = self.tokenizer.tokenize_and_convert_to_ids(
                reference)

            batch_candidates[i, :len(candidate_ids)] = torch.tensor(
                candidate_ids)
            batch_references[i, :len(reference_ids)] = torch.tensor(
                reference_ids)
            candidate_mask[i, :len(candidate_ids)] = 1
            reference_mask[i, :len(reference_ids)] = 1
            reference_scores[i] = bleu_score([reference_ids], candidate_ids)

        # compare scores between known and own implementation
        batch_scores = batch_BLEU(batch_candidates, batch_references,
                                  candidate_mask, reference_mask)
        for reference_score, batch_score in zip(reference_scores,
                                                batch_scores):
            self.assertAlmostEqual(reference_score, batch_score)
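
For reference, the same padded id batches and masks could also be built with torch.nn.utils.rnn.pad_sequence instead of writing into pre-allocated zero tensors. A minimal sketch, assuming tokenize_and_convert_to_ids returns a plain Python list of token ids and that 0 is the padding id (the pad_batch helper is hypothetical):

import torch
from torch.nn.utils.rnn import pad_sequence

def pad_batch(sentences, tokenizer, seq_len):
    # Tokenize each sentence into a 1-D LongTensor of token ids.
    ids = [torch.tensor(tokenizer.tokenize_and_convert_to_ids(s),
                        dtype=torch.long) for s in sentences]
    # Stack into a (batch, max_len) tensor, padding shorter rows with 0.
    batch = pad_sequence(ids, batch_first=True, padding_value=0)
    # Mask is 1 over real tokens, 0 over padding.
    mask = (batch != 0).long()
    # Right-pad both to the fixed seq_len used by the test, then truncate.
    extra = seq_len - batch.size(1)
    if extra > 0:
        batch = torch.nn.functional.pad(batch, (0, extra), value=0)
        mask = torch.nn.functional.pad(mask, (0, extra), value=0)
    return batch[:, :seq_len], mask[:, :seq_len]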
Example 2
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.batch_size = 8
        self.max_seq_len = 256
        self.ltoi = {'ar': 0, 'bg': 1, 'de': 2, 'en': 3}
        self.num_workers = 8
        self.tokenizer = BertTokenizer(
            'data/bert-base-multilingual-cased-vocab.txt')
Example 3
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer = BertTokenizer(
            "data/bert-base-multilingual-cased-vocab.txt")
        self.model = mock.MagicMock()
        self.model.return_value = [
            # return set of predicted ids
        ]

        # implement methods needed for mock
        self.languages = ['en']
        self.evaluator = EvaluateXNLI(self.model, self.tokenizer,
                                      self.languages)
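
If the mock has to hand back fixed predictions, the usual unittest.mock pattern is to set return_value on the mock itself, so that calling model(...) yields that value. A small sketch with purely illustrative token ids:

from unittest import mock

import torch

model = mock.MagicMock()
# Whatever the evaluator calls model with, it now gets this tensor back.
model.return_value = torch.tensor([[101, 7592, 102]])

predictions = model("any input")           # tensor([[101, 7592, 102]])
model.assert_called_once_with("any input")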
Example 4
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer = BertTokenizer(
            "data/bert-base-multilingual-cased-vocab")
Example 5
from torch.utils.data import DataLoader
from dataset import ParallelDataset, BertTokenizer

print("Running unittests for XNLI dataset...")
batch_size = 12
seq_len = 128

tokenizer = BertTokenizer("data/bert-base-multilingual-uncased-vocab.txt")
dataset = ParallelDataset("data/xnli.15way.orig.tsv", tokenizer, seq_len)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("passed initialization and dataloader tests")

for i, batch in enumerate(data_loader):
    assert type(batch) is dict
    assert len(batch.keys()) == 15
    assert batch['en'].shape == (batch_size, seq_len)
    if i > 10:
        break

print("passed batch sampling tests")

languages = ('vi', 'en')
tokenizer = BertTokenizer("data/bert-base-multilingual-uncased-vocab.txt")
dataset = ParallelDataset("data/xnli.15way.orig.tsv",
                          tokenizer,
                          seq_len,
                          languages=languages)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("passed initialization of subset of columns")
Example 6
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.batch_size = 16
        self.max_seq_len = 256
        self.num_workers = 8
        self.tokenizer = BertTokenizer(
            'data/bert-base-multilingual-cased-vocab.txt')
Example 7
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", type=str, default="./dataset/corpus/train.txt", help="train dataset for train bert")
    parser.add_argument("-t", "--test_dataset", type=str, default="./dataset/corpus/test.txt", help="test set for evaluate train set")
    #parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab")
    parser.add_argument("-o", "--output_path", type=str, default="./output/bert.model",  help="ex)output/bert.model")

    parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=512, help="maximum sequence len")

    parser.add_argument("-b", "--batch_size", type=int, default=8, help="number of batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=1, help="dataloader worker size")

    parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
    parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines", type=int, default=5110, help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")

    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value")

    args = parser.parse_args()

    print("Loading Vocab")
    tokenizer = BertTokenizer("./dataset/corpus")
    vocab_size = tokenizer.get_vocab_size()
    print("Vocab Size: ", vocab_size)

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, tokenizer, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, tokenizer, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(vocab_size, tokenizer.pad_index, hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, vocab_size, train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)
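
A typical entry point for this script (the file name train.py in the comment is only an example):

if __name__ == "__main__":
    # e.g. python train.py -c ./dataset/corpus/train.txt -o ./output/bert.model
    train()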