def test_copying(self):
  input_texts = ['Turing was born in 1912 in London .']
  tag_strs = ['KEEP'] * 8
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(task.realize_output(tags), input_texts[0])

  # With multiple inputs.
  input_texts = ['a B', 'c D e', 'f g']
  tag_strs = ['KEEP'] * 7
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(task.realize_output(tags), 'a B c D e f g')
def test_casing(self):
  input_texts = ['A b .', 'Cc dd .']
  # Test lowercasing after a period has been removed.
  tag_strs = ['KEEP', 'KEEP', 'DELETE', 'KEEP', 'KEEP', 'KEEP']
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(task.realize_output(tags), 'A b cc dd .')

  # Test uppercasing after the first uppercase token has been removed.
  tag_strs = ['KEEP', 'KEEP', 'KEEP', 'DELETE', 'KEEP', 'KEEP']
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(task.realize_output(tags), 'A b . Dd .')
def compute_tags(self, task, target):
  """Computes tags needed for converting the source into the target.

  Args:
    task: tagging.EditingTask that specifies the input.
    target: Target text.

  Returns:
    List of tagging.Tag objects. If the source couldn't be converted into the
    target via tagging, returns an empty list.
  """
  target_tokens = utils.get_token_list(target.lower())
  tags = self._compute_tags_fixed_order(task.source_tokens, target_tokens)
  # If conversion fails, try to obtain the target after swapping the source
  # order.
  if not tags and len(task.sources) == 2 and self._do_swap:
    swapped_task = tagging.EditingTask(task.sources[::-1])
    tags = self._compute_tags_fixed_order(swapped_task.source_tokens,
                                          target_tokens)
    if tags:
      tags = (tags[swapped_task.first_tokens[1]:] +
              tags[:swapped_task.first_tokens[1]])
      # We assume that the last token (typically a period) is never deleted,
      # so we can overwrite the tag_type with SWAP (which keeps the token,
      # moving it and the sentence it's part of to the end).
      tags[task.first_tokens[1] - 1].tag_type = tagging.TagType.SWAP
  return tags
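
# Illustrative usage sketch (not part of the original file): how compute_tags
# is typically driven together with EditingTask.realize_output. Assumes the
# `tagging` and `tagging_converter` modules referenced above are importable;
# the texts and the one-phrase vocabulary are illustrative.
def _example_compute_tags_usage():
  import tagging
  import tagging_converter

  task = tagging.EditingTask(
      ['Turing was born in 1912 .', 'Turing died in 1954 .'])
  converter = tagging_converter.TaggingConverter(['and'])
  tags = converter.compute_tags(task,
                                'Turing was born in 1912 and died in 1954 .')
  # An empty list signals that the target is unreachable with this vocabulary.
  if tags:
    return task.realize_output(tags)
  return None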
def test_deletion(self):
  input_texts = ['Turing was born in 1912 in London .']
  tag_strs = [
      'KEEP', 'DELETE', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'DELETE'
  ]
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  # "was" and "." should have been removed.
  self.assertEqual(
      task.realize_output(tags), 'Turing born in 1912 in London')
def test_no_match(self):
  input_texts = ['Turing was born in 1912 .', 'Turing died in 1954 .']
  target = 'Turing was born in 1912 and died in 1954 .'
  task = tagging.EditingTask(input_texts)
  phrase_vocabulary = ['but']
  converter = tagging_converter.TaggingConverter(phrase_vocabulary)
  tags = converter.compute_tags(task, target)
  # Vocabulary doesn't contain "and" so the inputs can't be converted to the
  # target.
  self.assertFalse(tags)
def test_phrase_adding(self):
  input_texts = ['Turing was born in 1912 in London .']
  tag_strs = [
      'KEEP', 'DELETE|, a pioneer in TCS ,', 'KEEP', 'KEEP', 'KEEP', 'KEEP',
      'KEEP', 'KEEP'
  ]
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(
      task.realize_output(tags),
      'Turing , a pioneer in TCS , born in 1912 in London .')
def test_swapping_complex(self):
  input_texts = [
      'Dylan won Nobel prize .', 'Dylan is an American musician .'
  ]
  tag_strs = [
      'DELETE', 'KEEP', 'KEEP', 'KEEP', 'SWAP', 'KEEP', 'DELETE|,', 'KEEP',
      'KEEP', 'KEEP', 'DELETE|,'
  ]
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(
      task.realize_output(tags),
      'Dylan , an American musician , won Nobel prize .')
def test_swapping(self):
  input_texts = [
      'Turing was born in 1912 in London .', 'Turing died in 1954 .'
  ]
  tag_strs = [
      'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'SWAP',
      'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP'
  ]
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(
      task.realize_output(tags),
      'Turing died in 1954 . Turing was born in 1912 in London .')
def test_invalid_swapping(self):
  # When the SWAP tag is assigned to a token other than the last token of the
  # first of two sentences, it should be treated as KEEP.
  input_texts = [
      'Turing was born in 1912 in London .', 'Turing died in 1954 .'
  ]
  tag_strs = [
      'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'SWAP', 'KEEP',
      'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP'
  ]
  tags = [tagging.Tag(s) for s in tag_strs]
  task = tagging.EditingTask(input_texts)
  self.assertEqual(
      task.realize_output(tags),
      'Turing was born in 1912 in London . Turing died in 1954 .')
def test_realize_output_in_order(self):
  """Test for when source tokens occur in the same relative order in the
  target string.
  """
  editing_task = tagging.EditingTask(["word1 word2 <::::> word3 "])
  tags_str = ['KEEP|0', 'KEEP|1', 'KEEP|and', 'DELETE', 'KEEP|3']
  tags = [tagging.Tag(tag) for tag in tags_str]
  result = editing_task.realize_output(tags)
  expected = "word1 word2 and word3 "
  self.assertEqual(expected, result)
def test_compute_tags_out_of_order(self):
  """Test for when the source tokens do not occur in the same relative
  order.
  """
  dummy_phrase_vocabulary = ['and']
  editing_task = tagging.EditingTask([" word1 word2 <::::> word3 "])
  converter = TaggingConverter(dummy_phrase_vocabulary)
  result = [
      str(tag) for tag in converter.compute_tags(editing_task,
                                                 "word2 word1 and word3 ")
  ]
  expected = ['KEEP|1', 'KEEP|0', 'KEEP|and', 'DELETE', 'KEEP|3']
  self.assertEqual(expected, result)
def test_compute_tags_infeasible(self):
  """Test for when the target cannot be constructed from the given edit
  vocabulary and source tokens.
  """
  dummy_phrase_vocabulary = ['and']
  editing_task = tagging.EditingTask([" word1 word2 <::::> word3 "])
  converter = TaggingConverter(dummy_phrase_vocabulary)
  result = [
      str(tag) for tag in converter.compute_tags(editing_task,
                                                 "word2 word1 but word3 ")
  ]
  expected = []
  self.assertEqual(expected, result)
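
# Illustrative round-trip sketch (not from the original file) tying the three
# pointer-based tests above together: compute_tags emits pointer-augmented
# tags such as 'KEEP|1', and realize_output rebuilds the target from them.
# Assumes the same EditingTask/TaggingConverter variant those tests use.
def _example_pointer_round_trip():
  editing_task = tagging.EditingTask([" word1 word2 <::::> word3 "])
  converter = TaggingConverter(['and'])
  tags = converter.compute_tags(editing_task, "word2 word1 and word3 ")
  # Expected tags: ['KEEP|1', 'KEEP|0', 'KEEP|and', 'DELETE', 'KEEP|3'],
  # which should realize back to "word2 word1 and word3 ".
  return editing_task.realize_output(tags)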
def _get_embeddings(self, text):
  """Computes BERT input features (not embedding vectors) for input text.

  Args:
    text: List of input texts.

  Returns:
    4-tuple of input_ids, input_mask, segment_ids, and token_start_indices.
  """
  tokens, token_start_indices = self._split_to_wordpieces(
      tagging.EditingTask(text).source_tokens)
  tokens = self._truncate_list(tokens)
  input_tokens = ['[CLS]'] + tokens + ['[SEP]']
  input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens)
  input_mask = [1] * len(input_ids)
  segment_ids = [0] * len(input_ids)
  return input_ids, input_mask, segment_ids, token_start_indices
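
# Hedged sketch (not part of the class above) of consuming the 4-tuple
# returned by _get_embeddings; `predictor` is a placeholder for an instance
# of the enclosing class, not an API defined in this repo.
def _example_get_embeddings_usage(predictor):
  input_ids, input_mask, segment_ids, token_start_indices = (
      predictor._get_embeddings(['Turing was born in 1912 .']))
  # The ids, mask, and segment ids are aligned one-to-one per wordpiece,
  # including the [CLS]/[SEP] markers added above; token_start_indices maps
  # each original token to its first wordpiece.
  assert len(input_ids) == len(input_mask) == len(segment_ids)
  return token_start_indices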
def test_wrong_number_of_tags(self):
  input_texts = ['1 2']
  tags = [tagging.Tag('KEEP')]
  task = tagging.EditingTask(input_texts)
  with self.assertRaises(ValueError):
    task.realize_output(tags)
def test_matching_conversion(self, input_texts, target, phrase_vocabulary,
                             target_tags):
  task = tagging.EditingTask(input_texts)
  converter = tagging_converter.TaggingConverter(phrase_vocabulary)
  tags = converter.compute_tags(task, target)
  self.assertEqual(tags_to_str(tags), tags_to_str(target_tags))
def build_bert_example(self,
                       sources,
                       target=None,
                       use_arbitrary_target_ids_for_infeasible_examples=False):
  """Constructs a BERT Example.

  Args:
    sources: List of source texts.
    target: Target text or None when building an example during inference.
    use_arbitrary_target_ids_for_infeasible_examples: Whether to build an
      example with arbitrary target ids even if the target can't be obtained
      via tagging.

  Returns:
    BertExample, or None if the conversion from text to tags was infeasible
    and use_arbitrary_target_ids_for_infeasible_examples == False.
  """
  # Compute target labels.
  task = tagging.EditingTask(sources)
  if target is not None:
    tags = self._converter.compute_tags(task, target)
    if not tags:
      if use_arbitrary_target_ids_for_infeasible_examples:
        # Create a tag sequence [KEEP, DELETE, KEEP, DELETE, ...] which is
        # unlikely to be predicted by chance.
        tags = [
            tagging.Tag('KEEP') if i % 2 == 0 else tagging.Tag('DELETE')
            for i, _ in enumerate(task.source_tokens)
        ]
      else:
        return None
  else:
    # If target is not provided, we set all target labels to KEEP.
    tags = [tagging.Tag('KEEP') for _ in task.source_tokens]
  labels = [self._label_map[str(tag)] for tag in tags]

  tokens, labels, token_start_indices = self._split_to_wordpieces(
      task.source_tokens, labels)

  tokens = self._truncate_list(tokens)
  labels = self._truncate_list(labels)

  input_tokens = ['[CLS]'] + tokens + ['[SEP]']
  labels_mask = [0] + [1] * len(labels) + [0]
  labels = [0] + labels + [0]

  input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens)
  input_mask = [1] * len(input_ids)
  segment_ids = [0] * len(input_ids)

  example = BertExample(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      labels=labels,
      labels_mask=labels_mask,
      token_start_indices=token_start_indices,
      task=task,
      default_label=self._keep_tag_id)
  example.pad_to_max_length(self._max_seq_length, self._pad_id)
  return example
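
# Hedged usage sketch for build_bert_example; `builder` is assumed to be an
# already-configured instance of the enclosing class (tokenizer, label map,
# converter, and max sequence length are set up elsewhere in the repo).
def _example_build_bert_example(builder):
  # Training-time call: labels are derived from the (source, target) pair.
  # None means the target was unreachable via tagging and arbitrary target
  # ids were not requested.
  train_example = builder.build_bert_example(
      ['Turing was born in 1912 in London .'],
      target='Turing born in 1912 in London')

  # Inference-time call: no target, so every label defaults to KEEP.
  infer_example = builder.build_bert_example(
      ['Turing was born in 1912 in London .'])
  return train_example, infer_example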