def test_basic_tokenizer_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=True)

    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["hello", "!", "how", "are", "you", "?"])
    self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])

def genTraindataEdu(self, goldEDUs):
    '''
    Generate training data by labeling every word with 0 (I) or 1 (B):
    1 if this word is the beginning of the EDU, 0 otherwise.
    '''
    p = copy.deepcopy(goldEDUs)
    # use the basic tokenizer instead of the bert tokenizer because bert is
    # character/word-piece level, but we are labeling words
    tokenizer = BasicTokenizer(do_lower_case=True)
    self.traindataEdu.append([])
    self.traindataEdu[self.traindataEduCount].append(['[CLS]'])
    self.traindataEdu[self.traindataEduCount].append(['[CLS]'])
    for sent in p:
        for j, word in enumerate(list(sent)):
            # split each word with space
            self.traindataEdu[self.traindataEduCount][0].append(word)
            if j == 0:
                self.traindataEdu[self.traindataEduCount][1].append("1")
            else:
                self.traindataEdu[self.traindataEduCount][1].append("0")
    self.traindataEdu[self.traindataEduCount][0].append('[SEP]')
    self.traindataEdu[self.traindataEduCount][1].append('[SEP]')
    self.traindataEduCount += 1
    return

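# Hedged sketch (not from the original repo): a minimal, self-contained illustration of the
# B/I word-labeling scheme described in the docstring above, assuming gold EDUs are given as
# lists of word strings. "1" marks the first word of an EDU, "0" every other word.
def label_edus(gold_edus):
    words, labels = ['[CLS]'], ['[CLS]']
    for edu in gold_edus:
        for j, word in enumerate(edu):
            words.append(word)
            labels.append("1" if j == 0 else "0")
    words.append('[SEP]')
    labels.append('[SEP]')
    return words, labels

# Example: two EDUs -> only their first words get label "1".
print(label_edus([["the", "cat", "sat"], ["because", "it", "was", "tired"]]))
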
def _bert_basic(self, text):
    if self.word_tokenizer is None:
        from pytorch_pretrained_bert.tokenization import BasicTokenizer
        self.word_tokenizer = BasicTokenizer(**self.config)
    return self.word_tokenizer.tokenize(text)

class BertBasicWordSplitter(WordSplitter):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """
    def __init__(self, do_lower_case: bool = True) -> None:
        self.basic_tokenizer = BertTokenizer(do_lower_case)

    @overrides
    def split_words(self, sentence: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(sentence)]

def __init__(self):
    print('[INFO] Loading the tokenizer')
    self.processor, self.bertTokenizer = init_params()
    label_list = self.processor.get_labels()
    self.label_map = {label: i for i, label in enumerate(label_list)}
    self.tokenizer = BasicTokenizer()
    print('[INFO] Tokenizer loaded')
    print('[INFO] Loading the model')
    self.model = Bert_CRF()
    self.model.load_state_dict(load_model(args.output_dir))
    self.device = torch.device(args.device if torch.cuda.is_available()
                               and not args.no_cuda else "cpu")
    self.model.to(self.device)
    self.model.eval()
    print('[INFO] Model loaded')

class BertPreTokenizer(Tokenizer):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """
    def __init__(self,
                 do_lower_case: bool = True,
                 never_split: Optional[List[str]] = None) -> None:
        if never_split is None:
            # Let BertTokenizer use its default
            self.basic_tokenizer = BertTokenizer(do_lower_case)
        else:
            self.basic_tokenizer = BertTokenizer(do_lower_case, never_split)

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(text)]

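# Hedged usage sketch (assumes pytorch_pretrained_bert is installed and the
# 'bert-base-uncased' vocabulary can be downloaded): first split a sentence into words
# with BasicTokenizer, then break each word into wordpieces, mirroring the
# pre-tokenizer / token-indexer split described in the docstring above.
from pytorch_pretrained_bert.tokenization import BasicTokenizer, BertTokenizer

basic = BasicTokenizer(do_lower_case=True)
bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

words = basic.tokenize("Johanson's house!")  # e.g. ["johanson", "'", "s", "house", "!"]
pieces = [bert.wordpiece_tokenizer.tokenize(w) for w in words]
print(words)
print(pieces)  # e.g. [["johan", "##son"], ["'"], ["s"], ["house"], ["!"]]
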
import random
import json

from pytorch_pretrained_bert.tokenization import BertTokenizer, BasicTokenizer

tokenizer = BertTokenizer.from_pretrained('uncased_L-12_H-768_A-12/vocab.txt')
basic_tokenizer = BasicTokenizer()

with open('ED-data/Few-Shot_ED.json', 'r') as f:
    data = json.loads(f.read())

# test1 Movement 5 labels
# test2 Conflict 4 labels
# test3 Life 10 labels
# test4 Sports 4 labels
# test5 Business 12 labels
# test6 Military 4 labels
# test7 Music 5 labels
domains = [
    'Movement', 'Conflict', 'Life', 'Sports', 'Business', 'Military', 'Music'
]

for order in range(0, 7):
    print('order', order)
    test = {}
    for test_order in range(0, 7):
        if test_order == order or test_order == (order + 1) % 7:
            continue
        B_label = domains[test_order]
        print(B_label)
        L_labels = []
        # select Movement.*

def test_basic_tokenizer_no_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=False)

    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"])

import re
import json
import random
import args
import os

from pytorch_pretrained_bert.tokenization import BasicTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from parser import entity_extractor
import torch

random.seed(233)
tokenizer = BasicTokenizer()
test_data = 'data/test_data.pth'


class doc:
    def __init__(self, news_id, content):
        self.id = news_id
        self.title = content[0]
        self.content = content[1:]


def make_submit(doc, parser: entity_extractor):
    entity_cont = {}
    title_entity = parser.extract([doc.title])
    content_entity = parser.extract(doc.content)
    doc_entity = content_entity | title_entity
    for entity in doc_entity:
        if doc.title.find(entity) > -1:
            entity_cont[entity] = entity_cont.get(entity, 0) + 10
    for stc in doc.content:

def test_chinese(self):
    tokenizer = BasicTokenizer()

    self.assertListEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                         [u"ah", u"\u535A", u"\u63A8", u"zz"])

import unicodedata


def test_BasicTokenizer():
    model = BasicTokenizer()

    ## Korean is not supported
    text = "안녕hello월드ahahah國民mrhs"
    print(model._tokenize_chinese_chars(text))
    print(unicodedata.normalize("NFD", text))

def from_config(cls, config: Config):
    basic_tokenizer = BasicTokenizer(do_lower_case=config.lowercase)
    return cls(basic_tokenizer)

class OpenAIGPTTokenizer(object):
    """
    BPE tokenizer. Peculiarities:
        - lower cases all inputs
        - uses the SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed,
          falls back to BERT's BasicTokenizer if not
        - argument special_tokens and function set_special_tokens can be used to add
          additional symbols (ex: "__classify__") to the vocabulary
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} and {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path, vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
            logger.info("loading merges file {}".format(merges_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
            logger.info("loading merges file {} from cache at {}".format(
                merges_file, resolved_merges_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index
            # sequences longer than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
        return tokenizer

    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning(
                "ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(
                do_lower_case=True,
                never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """
        Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
        if self.fix_text is None:
            # Using BERT's BasicTokenizer: we can update the tokenizer
            self.nlp.never_split = special_tokens
        logger.info("Special tokens {}".format(self.special_tokens))

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)
        if not pairs:
            return token + '</w>'

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break
                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend([t for t in self.bpe(token).split(' ')])
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                "sequence length for this OpenAI GPT model ({} > {}). Running this "
                "sequence through the model will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids into BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
        """Converts a sequence of ids into a string."""
        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        if clean_up_tokenization_spaces:
            out_string = out_string.replace('<unk>', '')
            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!') \
                .replace(' ,', ',').replace(" n't", "n't").replace(" 'm", "'m") \
                .replace(" 're", "'re").replace(" do not", " don't").replace(" 's", "'s") \
                .replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m ") \
                .replace(" 've", "'ve")
        return out_string

class BertTokenizerAndCandidateGenerator(Registrable):
    def __init__(self,
                 entity_candidate_generators: Dict[str, MentionGenerator],
                 entity_indexers: Dict[str, TokenIndexer],
                 bert_model_type: str,
                 do_lower_case: bool,
                 whitespace_tokenize: bool = True,
                 max_word_piece_sequence_length: int = 512) -> None:
        """
        Note: the fields need to be used with a pre-generated allennlp vocabulary
        that contains the entity id namespaces and the bert namespace.

        entity_indexers = {'wordnet': indexer for wordnet entities,
                           'wiki': indexer for wiki entities}
        """
        # load BertTokenizer from huggingface
        self.candidate_generators = entity_candidate_generators
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            bert_model_type, do_lower_case=do_lower_case)
        self.bert_word_tokenizer = BasicTokenizer(do_lower_case=False)
        # Target length should include start and end token
        self.max_word_piece_sequence_length = max_word_piece_sequence_length

        self._entity_indexers = entity_indexers
        # for bert, we'll give an empty token indexer with empty namespace
        # and do the indexing directly with the bert vocab to bypass
        # indexing in the indexer
        self._bert_single_id_indexer = {'tokens': SingleIdTokenIndexer('__bert__')}
        self.do_lowercase = do_lower_case
        self.whitespace_tokenize = whitespace_tokenize
        self.dtype = np.float32

    def _word_to_word_pieces(self, word):
        if self.do_lowercase and word not in self.bert_tokenizer.basic_tokenizer.never_split:
            word = word.lower()
        return self.bert_tokenizer.wordpiece_tokenizer.tokenize(word)

    def tokenize_and_generate_candidates(self, text_a: str, text_b: str = None):
        """
        # run BertTokenizer.basic_tokenizer.tokenize on sentence1 and sentence2 for word tokenization
        # generate candidate mentions for each of the generators and for each of
        #   sentence1 and sentence2 from the word tokenized text
        # run BertTokenizer.wordpiece_tokenizer on sentence1 and sentence2
        # truncate length, add [CLS] and [SEP] to word pieces
        # compute token offsets
        # combine candidate mention spans from sentence1 and sentence2 and remap to word piece indices

        returns:
            {'tokens': List[str], the word piece strings with [CLS] [SEP]
             'segment_ids': List[int], the same length as 'tokens' with 0/1 for sentence1 vs 2
             'candidates': Dict[str, Dict[str, Any]],
                {'wordnet': {'candidate_spans': List[List[int]],
                             'candidate_entities': List[List[str]],
                             'candidate_entity_prior': List[List[float]],
                             'segment_ids': List[int]},
                 'wiki': ...}
            }
        """
        offsets_a, grouped_wp_a, tokens_a = self._tokenize_text(text_a)

        if text_b is not None:
            offsets_b, grouped_wp_b, tokens_b = self._tokenize_text(text_b)
            truncate_sequence_pair(grouped_wp_a, grouped_wp_b,
                                   self.max_word_piece_sequence_length - 3)

            offsets_b = offsets_b[:len(grouped_wp_b)]
            tokens_b = tokens_b[:len(grouped_wp_b)]
            instance_b = self._generate_sentence_entity_candidates(tokens_b, offsets_b)
            word_piece_tokens_b = [
                word_piece for word in grouped_wp_b for word_piece in word
            ]
        else:
            length_a = sum([len(x) for x in grouped_wp_a])
            while self.max_word_piece_sequence_length - 2 < length_a:
                discarded = grouped_wp_a.pop()
                length_a -= len(discarded)

        word_piece_tokens_a = [
            word_piece for word in grouped_wp_a for word_piece in word
        ]
        offsets_a = offsets_a[:len(grouped_wp_a)]
        tokens_a = tokens_a[:len(grouped_wp_a)]
        instance_a = self._generate_sentence_entity_candidates(tokens_a, offsets_a)

        # If we got 2 sentences.
        if text_b is not None:
            # Target length should include the start token and two end tokens, and then be
            # divided equally between both sentences. Note that this can result in
            # shorter documents than the original target length, if one (or both)
            # of the sentences is shorter than half the target length.
            tokens = [start_token] + word_piece_tokens_a + [sep_token] + \
                     word_piece_tokens_b + [sep_token]
            segment_ids = (len(word_piece_tokens_a) + 2) * [0] + \
                          (len(word_piece_tokens_b) + 1) * [1]
            offsets_a = [x + 1 for x in offsets_a]
            offsets_b = [x + 2 + len(word_piece_tokens_a) for x in offsets_b]
        # Single sentence
        else:
            tokens = [start_token] + word_piece_tokens_a + [sep_token]
            segment_ids = len(tokens) * [0]
            offsets_a = [x + 1 for x in offsets_a]
            offsets_b = None

        for name in instance_a.keys():
            for span in instance_a[name]['candidate_spans']:
                span[0] += 1
                span[1] += 1

        fields: Dict[str, Sequence] = {}

        # concatenating both sentences (for both tokens and ids)
        if text_b is None:
            candidates = instance_a
        else:
            candidates: Dict[str, Field] = {}
            # Merging candidate lists for both sentences.
            for entity_type in instance_b:
                candidate_instance_a = instance_a[entity_type]
                candidate_instance_b = instance_b[entity_type]

                candidates[entity_type] = {}

                for span in candidate_instance_b['candidate_spans']:
                    span[0] += len(word_piece_tokens_a) + 2
                    span[1] += len(word_piece_tokens_a) + 2

                # Merging each of the fields.
                for key in ['candidate_entities', 'candidate_spans', 'candidate_entity_priors']:
                    candidates[entity_type][key] = \
                        candidate_instance_a[key] + candidate_instance_b[key]

        for entity_type in candidates.keys():
            # deal with @@PADDING@@
            if len(candidates[entity_type]['candidate_entities']) == 0:
                candidates[entity_type] = get_empty_candidates()
            else:
                padding_indices = []
                has_entity = False
                for cand_i, candidate_list in enumerate(
                        candidates[entity_type]['candidate_entities']):
                    if candidate_list == ["@@PADDING@@"]:
                        padding_indices.append(cand_i)
                        candidates[entity_type]["candidate_spans"][cand_i] = [-1, -1]
                    else:
                        has_entity = True
                indices_to_remove = []
                if has_entity and len(padding_indices) > 0:
                    # remove all the padding entities since we have some valid ones
                    indices_to_remove = padding_indices
                elif len(padding_indices) > 0:
                    assert len(padding_indices) == len(
                        candidates[entity_type]['candidate_entities'])
                    indices_to_remove = padding_indices[1:]
                for ind in reversed(indices_to_remove):
                    del candidates[entity_type]["candidate_spans"][ind]
                    del candidates[entity_type]["candidate_entities"][ind]
                    del candidates[entity_type]["candidate_entity_priors"][ind]

        # get the segment ids for the spans
        for key, cands in candidates.items():
            span_segment_ids = []
            for candidate_span in cands['candidate_spans']:
                span_segment_ids.append(segment_ids[candidate_span[0]])
            candidates[key]['candidate_segment_ids'] = span_segment_ids

        fields['tokens'] = tokens
        fields['segment_ids'] = segment_ids
        fields['candidates'] = candidates
        fields['offsets_a'] = offsets_a
        fields['offsets_b'] = offsets_b

        return fields

    def _tokenize_text(self, text):
        if self.whitespace_tokenize:
            tokens = text.split()
        else:
            tokens = self.bert_word_tokenizer.tokenize(text)

        word_piece_tokens = []
        offsets = [0]
        for token in tokens:
            word_pieces = self._word_to_word_pieces(token)
            offsets.append(offsets[-1] + len(word_pieces))
            word_piece_tokens.append(word_pieces)
        del offsets[0]
        return offsets, word_piece_tokens, tokens

    def _generate_sentence_entity_candidates(self, tokens, offsets):
        """
        Tokenize the sentence, trim it to the target length, and generate entity candidates.

        :param sentence
        :param target_length: The length of the output sentence in terms of word pieces.
        :return: Dict[str, Dict[str, Any]],
            {'wordnet': {'candidate_spans': List[List[int]],
                         'candidate_entities': List[List[str]],
                         'candidate_entity_priors': List[List[float]]},
             'wiki': ...}
        """
        assert len(tokens) == len(offsets), \
            f'Length of tokens {len(tokens)} must equal that of offsets {len(offsets)}.'
        entity_instances = {}
        for name, mention_generator in self.candidate_generators.items():
            entity_instances[name] = mention_generator.get_mentions_raw_text(
                ' '.join(tokens), whitespace_tokenize=True)

        for name, entities in entity_instances.items():
            candidate_spans = entities["candidate_spans"]
            adjusted_spans = []
            for start, end in candidate_spans:
                if 0 < start:
                    adjusted_span = [offsets[start - 1], offsets[end] - 1]
                else:
                    adjusted_span = [0, offsets[end] - 1]
                adjusted_spans.append(adjusted_span)
            entities['candidate_spans'] = adjusted_spans
            entity_instances[name] = entities

        return entity_instances

    def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
        """
        tokens_and_candidates is the return from a previous call to
        generate_sentence_entity_candidates.  Converts the dict to a dict of
        fields usable with allennlp.
        """
        fields = {}

        fields['tokens'] = TextField(
            [Token(t, text_id=self.bert_tokenizer.vocab[t])
             for t in tokens_and_candidates['tokens']],
            token_indexers=self._bert_single_id_indexer)

        fields['segment_ids'] = ArrayField(
            np.array(tokens_and_candidates['segment_ids']), dtype=np.int)

        all_candidates = {}
        for key, entity_candidates in tokens_and_candidates['candidates'].items():
            # pad the prior to create the array field
            # make a copy to avoid modifying the input
            candidate_entity_prior = copy.deepcopy(
                entity_candidates['candidate_entity_priors'])
            max_cands = max(len(p) for p in candidate_entity_prior)
            for p in candidate_entity_prior:
                if len(p) < max_cands:
                    p.extend([0.0] * (max_cands - len(p)))
            np_prior = np.array(candidate_entity_prior)

            candidate_fields = {
                "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
                "candidate_entities": TextField(
                    [Token(" ".join(candidate_list))
                     for candidate_list in entity_candidates["candidate_entities"]],
                    token_indexers={'ids': self._entity_indexers[key]}),
                "candidate_spans": ListField(
                    [SpanField(span[0], span[1], fields['tokens'])
                     for span in entity_candidates['candidate_spans']]),
                "candidate_segment_ids": ArrayField(
                    np.array(entity_candidates['candidate_segment_ids']), dtype=np.int)
            }
            all_candidates[key] = DictField(candidate_fields)

        fields["candidates"] = DictField(all_candidates)

        return fields

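# Hedged sketch (not from KnowBert itself): a self-contained illustration of the offset
# arithmetic used in _generate_sentence_entity_candidates above. `offsets[i]` is the
# cumulative number of word pieces after word i, so a word-level span [start, end] maps to
# the word-piece span [offsets[start - 1], offsets[end] - 1] (or [0, offsets[end] - 1] when
# start == 0).
def remap_span_to_word_pieces(span, pieces_per_word):
    offsets, total = [], 0
    for n in pieces_per_word:
        total += n
        offsets.append(total)
    start, end = span
    wp_start = offsets[start - 1] if start > 0 else 0
    return [wp_start, offsets[end] - 1]

# Words ["new", "york", "city"] -> pieces [["new"], ["york"], ["ci", "##ty"]];
# the word span [1, 2] ("york city") maps to word-piece span [1, 3].
print(remap_span_to_word_pieces([1, 2], [1, 1, 2]))
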
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text

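# Hedged usage sketch (assumes the function above is in scope along with
# pytorch_pretrained_bert's BasicTokenizer, collections, and a logger): recover the
# original casing and drop the trailing "'s", as in the "Steve Smith" example from the
# comment above.
pred_text = "steve smith"
orig_text = "Steve Smith's"
print(get_final_text(pred_text, orig_text, do_lower_case=True))  # expected: "Steve Smith"
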
            with open(file_stub + '.json', 'w') as f:
                json.dump(dump_dicts[i], f)
    else:
        file_stub = 'tf_idf/%s_dream_test_tfidf_agnostic_all_sorted' % args.prefix
        with open(file_stub + '.json', 'w') as f:
            json.dump(dump_dicts[0], f)


if __name__ == '__main__':
    # Parse Args
    arguments = parse_args()

    # Load BERT Tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(arguments.pretrained, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=False)

    # Create Dataset
    if arguments.dataset == 'race':
        D, PA = parse_data(arguments, bert_tokenizer, basic_tokenizer)

    # Compute TF Matrix
    TF = compute_tf(PA)

    # Compute TF-IDF Matrix
    print('\nComputing TF-IDF Matrix...')
    transformer = TfidfTransformer()
    TF_IDF = transformer.fit_transform(TF)
    assert(TF_IDF.shape[0] == len(PA) == len(TF))

    # Compute Scoring Matrix
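# Hedged sketch (assumes scikit-learn and numpy are available): how TfidfTransformer turns a
# raw term-frequency matrix into the TF-IDF matrix used for scoring above. The toy counts
# below are made up purely for illustration.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

TF_toy = np.array([[3, 0, 1],    # rows: passages, columns: vocabulary terms
                   [2, 1, 0],
                   [0, 2, 4]])
TF_IDF_toy = TfidfTransformer().fit_transform(TF_toy)
print(TF_IDF_toy.shape)          # (3, 3), same shape as the input count matrix
print(TF_IDF_toy.toarray())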