def setUp(self): # construct dummy dictionary vocab_size = 10 d = test_utils.dummy_dictionary(vocab_size=vocab_size) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 self.beam_size = 3 # construct prefix data self.tokens = torch.LongTensor([ [self.w1, self.w2, self.eos], ]) self.token_lengths = torch.LongTensor([2]) args = argparse.Namespace() unk = 0.0 args.beam_probs = [ # prefix step 0: torch.FloatTensor([ # eos [0.0, unk] + [1.0 / vocab_size] * vocab_size # beam 1 ] * self.beam_size), ] * vocab_size task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary
def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = torch.LongTensor( [[self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos],] ) self.src_lengths = torch.LongTensor([2, 2]) args = argparse.Namespace() unk = 0.0 args.beam_probs = [ # step 0: torch.FloatTensor( [ # eos w1 w2 # sentence 1: [0.0, unk, 0.9, 0.1], # beam 1 [0.0, unk, 0.9, 0.1], # beam 2 # sentence 2: [0.0, unk, 0.7, 0.3], [0.0, unk, 0.7, 0.3], ] ), # step 1: torch.FloatTensor( [ # eos w1 w2 # sentence 1: [0.0, unk, 0.6, 0.4], [0.0, unk, 0.6, 0.4], # sentence 2: [0.25, unk, 0.35, 0.4], [0.25, unk, 0.35, 0.4], ] ), # step 2: torch.FloatTensor( [ # eos w1 w2 # sentence 1: [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], # sentence 2: [0.9, unk, 0.1, 0.0], [0.9, unk, 0.1, 0.0], ] ), ] task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary
def setUp(self): # build dictionary self.d = test_utils.dummy_dictionary(3) vocab = len(self.d) self.assertEqual(vocab, 4 + 3) # 4 special + 3 tokens self.assertEqual(self.d.pad(), 1) self.assertEqual(self.d.eos(), 2) self.assertEqual(self.d.unk(), 3) pad, eos, unk, w1, w2, w3 = 1, 2, 3, 4, 5, 6 # noqa: F841 # build dataset self.data = [ # the first batch item has padding { 'source': torch.LongTensor([w1, eos]), 'target': torch.LongTensor([w1, eos]) }, { 'source': torch.LongTensor([w1, eos]), 'target': torch.LongTensor([w1, w1, eos]) }, ] self.sample = next(test_utils.dummy_dataloader(self.data)) # build model self.args = argparse.Namespace() self.args.sentence_avg = False self.args.probs = torch.FloatTensor([ # pad eos unk w1 w2 w3 [0.05, 0.05, 0.1, 0.05, 0.3, 0.4, 0.05], [0.05, 0.10, 0.2, 0.05, 0.2, 0.3, 0.10], [0.05, 0.15, 0.3, 0.05, 0.1, 0.2, 0.15], ]).unsqueeze(0).expand(2, 3, 7) # add batch dimension self.model = test_utils.TestModel.build_model(self.args, self.d, self.d)
def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = torch.LongTensor([ [self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos], ]) self.src_lengths = torch.LongTensor([2, 2]) args = argparse.Namespace() unk = 0. # The minimal probability of top 2 tokens. self.min_top2_prob = 0.75 # The minimal probability of the top 1 token. self.min_top1_prob = 0.4 w1_prob = self.min_top1_prob w2_prob = self.min_top2_prob - self.min_top1_prob eos_prob = 1 - self.min_top2_prob args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 1.0, 0.0], [0.0, unk, 1.0, 0.0], [0.0, unk, 1.0, 0.0], [0.0, unk, 1.0, 0.0], ]), # step 1: torch.FloatTensor([ # eos w1 w2 [eos_prob, unk, w1_prob, w2_prob], [eos_prob, unk, w1_prob, w2_prob], [eos_prob, unk, w1_prob, w2_prob], [eos_prob, unk, w1_prob, w2_prob], ]), # step 2: torch.FloatTensor([ # eos w1 w2 [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], ]), ] task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary
def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = torch.LongTensor([ [self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos], ]) self.src_lengths = torch.LongTensor([2, 2]) args = argparse.Namespace() unk = 0. args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 # sentence 1: [0.0, unk, 0.9, 0.1], # beam 1 [0.0, unk, 0.9, 0.1], # beam 2 # sentence 2: [0.0, unk, 0.7, 0.3], [0.0, unk, 0.7, 0.3], ]), # step 1: torch.FloatTensor([ # eos w1 w2 # sentence 1: [0.0, unk, 0.6, 0.4], [0.0, unk, 0.6, 0.4], # sentence 2: [0.25, unk, 0.35, 0.4], [0.25, unk, 0.35, 0.4], ]), # step 2: torch.FloatTensor([ # eos w1 w2 # sentence 1: [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], # sentence 2: [0.9, unk, 0.1, 0.0], [0.9, unk, 0.1, 0.0], ]), ] task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary
def test_eval_dataloader(self): dictionary = test_utils.dummy_dictionary(10) assert len(dictionary) == 14 # 4 extra special symbols assert dictionary.pad() == 1 dataset = test_utils.TestDataset([ torch.tensor([4, 5, 6, 7], dtype=torch.long), torch.tensor([8, 9, 10, 11], dtype=torch.long), torch.tensor([12, 13], dtype=torch.long), ]) dataset = MonolingualDataset(dataset, sizes=[4, 4, 2], src_vocab=dictionary) config = LanguageModelingConfig(tokens_per_sample=4) task = LanguageModelingTask(config, dictionary) eval_dataloader = task.eval_lm_dataloader( dataset=dataset, batch_size=1, context_window=2, num_workers=0, ) batch = next(eval_dataloader) assert batch["net_input"]["src_tokens"][0].tolist() == [ 4, 5, 6, 7, 1, 1 ] assert batch["target"][0].tolist() == [4, 5, 6, 7, 1, 1] batch = next(eval_dataloader) assert batch["net_input"]["src_tokens"][0].tolist() == [ 6, 7, 8, 9, 10, 11 ] assert batch["target"][0].tolist() == [1, 1, 8, 9, 10, 11] batch = next(eval_dataloader) assert batch["net_input"]["src_tokens"][0].tolist() == [10, 11, 12, 13] assert batch["target"][0].tolist() == [1, 1, 12, 13]
def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = Variable( torch.LongTensor([ [self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos], ])) self.src_lengths = Variable(torch.LongTensor([2, 2])) args = argparse.Namespace() unk = 0. args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 # sentence 1: [0.0, unk, 0.9, 0.1], # beam 1 [0.0, unk, 0.9, 0.1], # beam 2 # sentence 2: [0.0, unk, 0.7, 0.3], [0.0, unk, 0.7, 0.3], ]), # step 1: torch.FloatTensor([ # eos w1 w2 prefix # sentence 1: [1.0, unk, 0.0, 0.0], # w1: 0.9 (emit: w1 <eos>: 0.9*1.0) [0.0, unk, 0.9, 0.1], # w2: 0.1 # sentence 2: [0.25, unk, 0.35, 0.4], # w1: 0.7 (don't emit: w1 <eos>: 0.7*0.25) [0.00, unk, 0.10, 0.9], # w2: 0.3 ]), # step 2: torch.FloatTensor([ # eos w1 w2 prefix # sentence 1: [0.0, unk, 0.1, 0.9], # w2 w1: 0.1*0.9 [0.6, unk, 0.2, 0.2], # w2 w2: 0.1*0.1 (emit: w2 w2 <eos>: 0.1*0.1*0.6) # sentence 2: [0.60, unk, 0.4, 0.00], # w1 w2: 0.7*0.4 (emit: w1 w2 <eos>: 0.7*0.4*0.6) [0.01, unk, 0.0, 0.99], # w2 w2: 0.3*0.9 ]), # step 3: torch.FloatTensor([ # eos w1 w2 prefix # sentence 1: [ 1.0, unk, 0.0, 0.0 ], # w2 w1 w2: 0.1*0.9*0.9 (emit: w2 w1 w2 <eos>: 0.1*0.9*0.9*1.0) [ 1.0, unk, 0.0, 0.0 ], # w2 w1 w1: 0.1*0.9*0.1 (emit: w2 w1 w1 <eos>: 0.1*0.9*0.1*1.0) # sentence 2: [ 0.1, unk, 0.5, 0.4 ], # w2 w2 w2: 0.3*0.9*0.99 (emit: w2 w2 w2 <eos>: 0.3*0.9*0.99*0.1) [ 1.0, unk, 0.0, 0.0 ], # w1 w2 w1: 0.7*0.4*0.4 (emit: w1 w2 w1 <eos>: 0.7*0.4*0.4*1.0) ]), ] self.model = test_utils.TestModel.build_model(args, d, d)
def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = torch.LongTensor([ [self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos], ]) self.src_lengths = torch.LongTensor([2, 2]) args = argparse.Namespace() unk = 0. args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 # sentence 1: [0.0, unk, 0.9, 0.1], # beam 1 [0.0, unk, 0.9, 0.1], # beam 2 # sentence 2: [0.0, unk, 0.7, 0.3], [0.0, unk, 0.7, 0.3], ]), # step 1: torch.FloatTensor([ # eos w1 w2 prefix # sentence 1: [1.0, unk, 0.0, 0.0], # w1: 0.9 (emit: w1 <eos>: 0.9*1.0) [0.0, unk, 0.9, 0.1], # w2: 0.1 # sentence 2: [0.25, unk, 0.35, 0.4], # w1: 0.7 (don't emit: w1 <eos>: 0.7*0.25) [0.00, unk, 0.10, 0.9], # w2: 0.3 ]), # step 2: torch.FloatTensor([ # eos w1 w2 prefix # sentence 1: [0.0, unk, 0.1, 0.9], # w2 w1: 0.1*0.9 [0.6, unk, 0.2, 0.2], # w2 w2: 0.1*0.1 (emit: w2 w2 <eos>: 0.1*0.1*0.6) # sentence 2: [0.60, unk, 0.4, 0.00], # w1 w2: 0.7*0.4 (emit: w1 w2 <eos>: 0.7*0.4*0.6) [0.01, unk, 0.0, 0.99], # w2 w2: 0.3*0.9 ]), # step 3: torch.FloatTensor([ # eos w1 w2 prefix # sentence 1: [1.0, unk, 0.0, 0.0], # w2 w1 w2: 0.1*0.9*0.9 (emit: w2 w1 w2 <eos>: 0.1*0.9*0.9*1.0) [1.0, unk, 0.0, 0.0], # w2 w1 w1: 0.1*0.9*0.1 (emit: w2 w1 w1 <eos>: 0.1*0.9*0.1*1.0) # sentence 2: [0.1, unk, 0.5, 0.4], # w2 w2 w2: 0.3*0.9*0.99 (emit: w2 w2 w2 <eos>: 0.3*0.9*0.99*0.1) [1.0, unk, 0.0, 0.0], # w1 w2 w1: 0.7*0.4*0.4 (emit: w1 w2 w1 <eos>: 0.7*0.4*0.4*1.0) ]), ] task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary
def __init__(self, args): super().__init__(args) self.dictionary = dummy_dictionary(VOCAB_SIZE - 4) assert len(self.dictionary) == VOCAB_SIZE
def test_sequence_scorer(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) eos = d.eos() w1 = 4 w2 = 5 # construct dataloader data = [ { "source": torch.LongTensor([w1, w2, eos]), "target": torch.LongTensor([w1, w2, w1, eos]), }, { "source": torch.LongTensor([w2, eos]), "target": torch.LongTensor([w2, w1, eos]), }, { "source": torch.LongTensor([w2, eos]), "target": torch.LongTensor([w2, eos]), }, ] data_itr = test_utils.dummy_dataloader(data) # specify expected output probabilities args = argparse.Namespace() unk = 0.0 args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 0.6, 0.4], # sentence 1 [0.0, unk, 0.4, 0.6], # sentence 2 [0.0, unk, 0.7, 0.3], # sentence 3 ]), # step 1: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 0.2, 0.7], # sentence 1 [0.0, unk, 0.8, 0.2], # sentence 2 [0.7, unk, 0.1, 0.2], # sentence 3 ]), # step 2: torch.FloatTensor([ # eos w1 w2 [0.10, unk, 0.50, 0.4], # sentence 1 [0.15, unk, 0.15, 0.7], # sentence 2 [0.00, unk, 0.00, 0.0], # sentence 3 ]), # step 3: torch.FloatTensor([ # eos w1 w2 [0.9, unk, 0.05, 0.05], # sentence 1 [0.0, unk, 0.00, 0.0], # sentence 2 [0.0, unk, 0.00, 0.0], # sentence 3 ]), ] expected_scores = [ [0.6, 0.7, 0.5, 0.9], # sentence 1 [0.6, 0.8, 0.15], # sentence 2 [0.3, 0.7], # sentence 3 ] task = test_utils.TestTranslationTask.setup_task(args, d, d) model = task.build_model(args) scorer = SequenceScorer(task.target_dictionary) for sample in data_itr: hypos = task.inference_step(scorer, [model], sample) for id, hypos_id in zip(sample["id"].tolist(), hypos): self.assertHypoTokens(hypos_id[0], data[id]["target"]) self.assertHypoScore(hypos_id[0], expected_scores[id])
def test_sequence_scorer(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) eos = d.eos() w1 = 4 w2 = 5 # construct dataloader data = [ { 'source': torch.LongTensor([w1, w2, eos]), 'target': torch.LongTensor([w1, w2, w1, eos]), }, { 'source': torch.LongTensor([w2, eos]), 'target': torch.LongTensor([w2, w1, eos]), }, { 'source': torch.LongTensor([w2, eos]), 'target': torch.LongTensor([w2, eos]), }, ] data_itr = test_utils.dummy_dataloader(data) # specify expected output probabilities args = argparse.Namespace() unk = 0. args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 0.6, 0.4], # sentence 1 [0.0, unk, 0.4, 0.6], # sentence 2 [0.0, unk, 0.7, 0.3], # sentence 3 ]), # step 1: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 0.2, 0.7], # sentence 1 [0.0, unk, 0.8, 0.2], # sentence 2 [0.7, unk, 0.1, 0.2], # sentence 3 ]), # step 2: torch.FloatTensor([ # eos w1 w2 [0.10, unk, 0.50, 0.4], # sentence 1 [0.15, unk, 0.15, 0.7], # sentence 2 [0.00, unk, 0.00, 0.0], # sentence 3 ]), # step 3: torch.FloatTensor([ # eos w1 w2 [0.9, unk, 0.05, 0.05], # sentence 1 [0.0, unk, 0.00, 0.0], # sentence 2 [0.0, unk, 0.00, 0.0], # sentence 3 ]), ] expected_scores = [ [0.6, 0.7, 0.5, 0.9], # sentence 1 [0.6, 0.8, 0.15], # sentence 2 [0.3, 0.7], # sentence 3 ] model = test_utils.TestModel.build_model(args, d, d) scorer = SequenceScorer([model]) for id, _src, _ref, hypos in scorer.score_batched_itr(data_itr): self.assertHypoTokens(hypos[0], data[id]['target']) self.assertHypoScore(hypos[0], expected_scores[id])
def test_sequence_scorer(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) eos = d.eos() w1 = 4 w2 = 5 # construct dataloader data = [ { 'source': torch.LongTensor([w1, w2, eos]), 'target': torch.LongTensor([w1, w2, w1, eos]), }, { 'source': torch.LongTensor([w2, eos]), 'target': torch.LongTensor([w2, w1, eos]), }, { 'source': torch.LongTensor([w2, eos]), 'target': torch.LongTensor([w2, eos]), }, ] data_itr = test_utils.dummy_dataloader(data) # specify expected output probabilities args = argparse.Namespace() unk = 0. args.beam_probs = [ # step 0: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 0.6, 0.4], # sentence 1 [0.0, unk, 0.4, 0.6], # sentence 2 [0.0, unk, 0.7, 0.3], # sentence 3 ]), # step 1: torch.FloatTensor([ # eos w1 w2 [0.0, unk, 0.2, 0.7], # sentence 1 [0.0, unk, 0.8, 0.2], # sentence 2 [0.7, unk, 0.1, 0.2], # sentence 3 ]), # step 2: torch.FloatTensor([ # eos w1 w2 [0.10, unk, 0.50, 0.4], # sentence 1 [0.15, unk, 0.15, 0.7], # sentence 2 [0.00, unk, 0.00, 0.0], # sentence 3 ]), # step 3: torch.FloatTensor([ # eos w1 w2 [0.9, unk, 0.05, 0.05], # sentence 1 [0.0, unk, 0.00, 0.0], # sentence 2 [0.0, unk, 0.00, 0.0], # sentence 3 ]), ] expected_scores = [ [0.6, 0.7, 0.5, 0.9], # sentence 1 [0.6, 0.8, 0.15], # sentence 2 [0.3, 0.7], # sentence 3 ] task = test_utils.TestTranslationTask.setup_task(args, d, d) model = task.build_model(args) scorer = SequenceScorer([model], task.target_dictionary) for id, _src, _ref, hypos in scorer.score_batched_itr(data_itr): self.assertHypoTokens(hypos[0], data[id]['target']) self.assertHypoScore(hypos[0], expected_scores[id])