def setUp(self):
  """Writes a small vocabulary to a temp file and builds a tokenizer."""
  super(UtilsTest, self).setUp()
  vocab_tokens = [
      'NOTHING',
      '[CLS]',
      '[SEP]',
      '[MASK]',
      '[PAD]',
      'a',
      'b',
      'c',
      '##d',
      'd',
      '##e',
  ]
  # One token per line, including a trailing newline after the last token.
  vocab_file = self.create_tempfile()
  vocab_file.write_text('\n'.join(vocab_tokens) + '\n')
  self._tokenizer = tokenization.FullTokenizer(
      vocab_file.full_path, do_lower_case=True)
def __init__(self,
             label_map,
             max_seq_length,
             do_lower_case,
             converter,
             use_open_vocab,
             vocab_file=None,
             converter_insertion=None,
             special_glue_string_for_sources=None):
  """Constructs a BertExampleBuilder.

  Args:
    label_map: Dict mapping tag strings to integer tag IDs.
    max_seq_length: Maximum length of a tokenized sequence.
    do_lower_case: True for uncased BERT models, False for cased ones.
    converter: Object that converts text targets to points.
    use_open_vocab: Whether MASK tokens (rather than phrases) should be
      inserted. Only True is currently supported.
    vocab_file: Path to the BERT vocabulary file.
    converter_insertion: Optional converter that builds an insertion example
      from the tagger output.
    special_glue_string_for_sources: Optional joiner placed between multiple
      sources when combining them into one string; the empty string is a
      valid value.

  Raises:
    ValueError: If two entries in `label_map` share the same tag ID.
  """
  self.label_map = label_map
  # Build the ID -> label mapping, rejecting duplicate IDs up front so the
  # inverse is well defined.
  id_to_label = {}
  for label, label_id in label_map.items():
    if label_id in id_to_label:
      raise ValueError(
          'Multiple labels with the same ID: {}'.format(label_id))
    id_to_label[label_id] = label
  self._inverse_label_map = frozendict.frozendict(id_to_label)
  self.tokenizer = tokenization.FullTokenizer(
      vocab_file, do_lower_case=do_lower_case)
  self._max_seq_length = max_seq_length
  self._converter = converter
  self._pad_id = self._get_pad_id()
  self._do_lower_case = do_lower_case
  self._use_open_vocab = use_open_vocab
  self._converter_insertion = converter_insertion
  # Default to a single space when no explicit glue string was supplied.
  self._special_glue_string_for_sources = (
      ' ' if special_glue_string_for_sources is None
      else special_glue_string_for_sources)
def __init__(self,
             label_map,
             vocab_file,
             do_lower_case,
             max_seq_length,
             max_predictions_per_seq,
             max_insertions_per_token,
             insert_after_token=True,
             special_glue_string_for_sources=None):
  """Constructs a BertExampleBuilder for insertion examples.

  Args:
    label_map: Dict mapping (base_tag, num_insertions) tuples to tag IDs.
    vocab_file: Path to the vocabulary file.
    do_lower_case: Whether input text should be lowercased.
    max_seq_length: Maximum length of a tokenized sequence.
    max_predictions_per_seq: Maximum number of tokens inserted per input.
    max_insertions_per_token: Maximum number of tokens/masks inserted per
      source token.
    insert_after_token: Whether insertions go after the current token.
    special_glue_string_for_sources: Optional joiner placed between multiple
      sources when combining them into one string; the empty string is a
      valid value.
  """
  self.label_map = label_map
  self.tokenizer = tokenization.FullTokenizer(
      vocab_file, do_lower_case=do_lower_case)
  self._max_seq_length = max_seq_length
  self._max_predictions_per_seq = max_predictions_per_seq
  self._max_insertions_per_token = max_insertions_per_token
  self._insert_after_token = insert_after_token
  # Fall back to ID 0 when the vocabulary has no PAD entry.
  try:
    self._pad_id = self.tokenizer.convert_tokens_to_ids([constants.PAD])[0]
  except KeyError:
    self._pad_id = 0
  # Default to a single space when no explicit glue string was supplied.
  self._special_glue_string_for_sources = (
      ' ' if special_glue_string_for_sources is None
      else special_glue_string_for_sources)
def __init__(self,
             max_seq_length,
             max_predictions_per_seq,
             label_map,
             vocab_file=None,
             do_lower_case=True,
             fall_back_mode='random'):
  """Constructs an InsertionConverter.

  Args:
    max_seq_length: Maximum length of the source sequence.
    max_predictions_per_seq: Maximum number of MASK tokens.
    label_map: Dict used to convert label IDs back to labels.
    vocab_file: Path to the BERT vocabulary file.
    do_lower_case: Whether text is lowercased.
    fall_back_mode: Behavior when no MASK tokens are generated:
      'random': randomly add MASK tokens;
      'force': leave the output unchanged (not recommended);
      anything else: return None and terminate early (saves computation).
  """
  self._max_seq_length = max_seq_length
  self._max_predictions_per_seq = max_predictions_per_seq
  self._tokenizer = tokenization.FullTokenizer(
      vocab_file, do_lower_case=do_lower_case)
  self._label_map = label_map
  self._label_map_inverse = {
      label_id: label for label, label_id in self._label_map.items()
  }
  # Normalize once; the mode fully determines both behavior flags.
  mode = fall_back_mode.lower()
  self._do_random_mask = mode == 'random'
  self._do_lazy_generation = mode not in ('random', 'force')