def test_basic_tokenizer_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=True)

    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["hello", "!", "how", "are", "you", "?"])
    self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
def test_basic_tokenizer_lower_strip_accents_default(self):
    tokenizer = BasicTokenizer(do_lower_case=True)

    self.assertListEqual(
        tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "),
        ["hallo", "!", "how", "are", "you", "?"]
    )
    self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def customize_tokenizer(text, do_lower_case=False):
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    # text = convert_to_unicode(text)
    for c in text:
        if (tokenizer._is_chinese_char(ord(c)) or _is_punctuation(c)
                or _is_whitespace(c) or _is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
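# Hedged usage sketch for customize_tokenizer above (not from the original
# source): it assumes _is_punctuation, _is_whitespace and _is_control come from
# the same BERT tokenization module that provides BasicTokenizer.
from transformers.tokenization_bert import (BasicTokenizer, _is_control,
                                            _is_punctuation, _is_whitespace)

print(customize_tokenizer(u"ah\u535A\u63A8zz!", do_lower_case=True))
# expected: ['ah', '博', '推', 'zz', '!'] -- every CJK character and every
# punctuation mark is padded with spaces and so becomes its own token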
def build_from_p_e_m_file(p_e_m_file, dump_db_file, wiki_mention_db_file, **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    wiki_mention_db = MentionDB(wiki_mention_db_file)
    MentionDB.build_from_p_e_m_file(p_e_m_file, dump_db, wiki_mention_db, tokenizer,
                                    normalizer, **kwargs)
class BertLowercaseNormalizer(object):
    def __init__(self, never_lowercase=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        self._tokenizer = BasicTokenizer()
        self._never_lowercase = frozenset(never_lowercase)

    def normalize(self, token):
        if token not in self._never_lowercase:
            token = token.lower()
            token = self._tokenizer._run_strip_accents(token)
        return token
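# Minimal sketch of BertLowercaseNormalizer in use (illustrative inputs):
# ordinary tokens are lowercased and accent-stripped, while the special BERT
# tokens listed in never_lowercase pass through unchanged.
normalizer = BertLowercaseNormalizer()
print(normalizer.normalize(u"H\u00E9llo"))  # expected: "hello"
print(normalizer.normalize("[CLS]"))        # expected: "[CLS]" (untouched)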
def from_config(cls, config: Config):
    basic_tokenizer = BasicTokenizer(
        do_lower_case=config.lowercase,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),  # compatibility with HF v0.5
    )
    return cls(basic_tokenizer)
class BertPreTokenizer(Tokenizer):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """

    default_never_split = ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    def __init__(self, do_lower_case: bool = True, never_split: Optional[List[str]] = None) -> None:
        if never_split is None:
            never_split = self.default_never_split
        else:
            never_split = never_split + self.default_never_split
        self.basic_tokenizer = BertTokenizer(do_lower_case, never_split)
        self.basic_tokenizer._run_split_on_punc = self._run_split_on_punc
        self.never_split = never_split

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(text)]

    # HACK: Monkeypatch for huggingface's broken BasicTokenizer.
    # TODO(Mark): Remove this once https://github.com/huggingface/transformers/pull/2557
    # is merged.
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if never_split is None:
            never_split = self.never_split
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]
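# Hedged usage sketch for BertPreTokenizer (example sentence is made up):
# with the monkeypatched _run_split_on_punc, tokens in never_split such as
# "[SEP]" are kept whole instead of being split on '[' and ']'.
pre_tokenizer = BertPreTokenizer(do_lower_case=True)
print([t.text for t in pre_tokenizer.tokenize("Hello, world! [SEP]")])
# expected: ['hello', ',', 'world', '!', '[SEP]']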
class SerializableBertTokenizer(transformers.BertTokenizer, SerializationMixin):
    serialization_fields = list(BASE_CLASS_FIELDS) + [
        "vocab",
        "do_basic_tokenize",
        "do_lower_case",
        "never_split",
        "tokenize_chinese_chars",
    ]

    @classmethod
    def blank(cls):
        self = cls.__new__(cls)
        for field in self.serialization_fields:
            setattr(self, field, None)
        self.ids_to_tokens = None
        self.basic_tokenizer = None
        self.wordpiece_tokenizer = None
        return self

    def prepare_for_serialization(self):
        if self.basic_tokenizer is not None:
            self.do_lower_case = self.basic_tokenizer.do_lower_case
            self.never_split = self.basic_tokenizer.never_split
            self.tokenize_chinese_chars = self.basic_tokenizer.tokenize_chinese_chars
        super().prepare_for_serialization()

    def finish_deserializing(self):
        self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        if self.do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=self.do_lower_case,
                never_split=self.never_split,
                tokenize_chinese_chars=self.tokenize_chinese_chars,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
        super().finish_deserializing()

    def clean_token(self, text):
        if self.do_basic_tokenize:
            text = self.basic_tokenizer._clean_text(text)
        text = text.strip()
        return clean_accents(text)

    def clean_wp_token(self, token):
        return token.replace("##", "", 1).strip()

    def add_special_tokens(self, segments):
        output = []
        for segment in segments:
            output.extend(segment)
            if segment:
                output.append(self.sep_token)
        if output:
            # If we otherwise would have an empty output, don't add cls
            output.insert(0, self.cls_token)
        return output

    def fix_alignment(self, segments):
        """Turn a nested segment alignment into an alignment for the whole input,
        by offsetting and accounting for special tokens."""
        offset = 0
        output = []
        for segment in segments:
            if segment:
                offset += 1
            seen = set()
            for idx_group in segment:
                output.append([idx + offset for idx in idx_group])
                seen.update({idx for idx in idx_group})
            offset += len(seen)
        return output
def test_basic_tokenizer_no_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=False)

    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"])
def test_chinese(self):
    tokenizer = BasicTokenizer()

    self.assertListEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                         [u"ah", u"\u535A", u"\u63A8", u"zz"])
def test_basic_tokenizer_respects_never_split_tokens(self):
    tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

    self.assertListEqual(
        tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"),
        ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"])
def test_basic_tokenizer_no_lower_strip_accents_true(self):
    tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

    self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "),
                         ["HaLLo", "!", "how", "Are", "yoU", "?"])
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for idx, c in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = idx
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, ns_to_s_map

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        return orig_text

    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
    return output_text
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
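# Hedged sketch of a typical get_final_text call during SQuAD-style
# post-processing (the strings are illustrative only): the normalized
# prediction span is projected back onto the original, untokenized text.
final_text = get_final_text(pred_text="steve smith",
                            orig_text="Steve Smith's",
                            do_lower_case=True)
print(final_text)  # expected: "Steve Smith"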
def read_sequence(self, dataset, path, is_train, max_sents):
    """
    Reads conllu-like files. It relies heavily on reader_utils.seqs2data.
    Can also read sentence classification tasks for which the labels should
    be specified in the comments.

    Note that this read corresponds to a variety of task_types, but the
    differences between them during data reading are kept minimal.
    """
    data = []
    word_idx = self.datasets[dataset]['word_idx']
    sent_counter = 0
    tknzr = BasicTokenizer()

    for sent, full_data in seqs2data(path, self.do_lowercase):
        task2type = {}
        sent_counter += 1
        if max_sents != 0 and sent_counter > max_sents:
            break
        sent_tasks = {}
        tokens = [token[word_idx] for token in sent]
        for tokenIdx in range(len(tokens)):
            if len(tknzr._clean_text(tokens[tokenIdx])) == 0:
                tokens[tokenIdx] = self.tokenizer.tokenizer.unk_token
        sent_tasks['tokens'] = [Token(token) for token in tokens]

        col_idxs = {'word_idx': word_idx}
        for task in self.datasets[dataset]['tasks']:
            sent_tasks[task] = []
            task_type = self.datasets[dataset]['tasks'][task]['task_type']
            task_idx = self.datasets[dataset]['tasks'][task]['column_idx']
            task2type[task] = task_type
            col_idxs[task] = task_idx

            if task_type == 'classification' and task_idx == -1:
                start = '# ' + task + ': '
                for line in full_data:
                    if line[0].startswith(start):
                        sent_tasks[task] = line[0][len(start):]
            elif task_type in ['seq', 'multiseq', 'seq_bio']:
                for word_data in sent:
                    sent_tasks[task].append(word_data[task_idx])
            elif task_type == 'string2string':
                for word_data in sent:
                    task_label = gen_lemma_rule(word_data[word_idx],
                                                word_data[task_idx])
                    sent_tasks[task].append(task_label)
            elif task_type == 'dependency':
                heads = []
                rels = []
                for word_data in sent:
                    if not word_data[task_idx].isdigit():
                        logger.error(
                            "Your dependency file " + path +
                            " seems to contain invalid structures sentence " +
                            str(sent_counter) +
                            " contains a non-integer head: " + word_data[task_idx] +
                            "\nIf you directly used UD data, this could be due to special "
                            "EUD constructions which we do not support, you can clean your "
                            "conllu file by using scripts/misc/cleanconl.py")
                        exit(1)
                    heads.append(int(word_data[task_idx]))
                    rels.append(word_data[task_idx + 1])
                sent_tasks[task] = list(zip(rels, heads))
            else:
                logger.error('Task type ' + task_type + ' for task ' + task +
                             ' in dataset ' + dataset + ' is unknown')

        data.append(
            self.text_to_instance(sent_tasks, full_data, col_idxs, is_train,
                                  task2type, dataset))
    return data
import sys
from collections import OrderedDict
import json

from transformers.tokenization_bert import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=True)

data1 = json.load(open(sys.argv[1]), object_pairs_hook=OrderedDict)
data2 = json.load(open(sys.argv[2]))

for key, value in data1.items():
    value = tokenizer.tokenize(value)
    if len(value) > 30 and data2[key] != "" and data2[key] != "empty":
        data1[key] = data2[key]

json.dump(data1, open(sys.argv[3], "w"), ensure_ascii=False, indent=4)
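# Assumed invocation of the script above (file names are placeholders, not
# taken from the original repository):
#   python replace_long_values.py data1.json data2.json output.json
# Entries in the first JSON whose basic-tokenized value exceeds 30 tokens are
# replaced by the corresponding non-empty entry from the second JSON.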
class WordEmbedding(nn.Module):
    def __init__(self, config):
        super(WordEmbedding, self).__init__()
        self.config = config
        self.tokenizer = BasicTokenizer(do_lower_case=True)
        # standard deviation of initialization
        init_std = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.initializer_range
        self.words = []
        self.word2idx = {}
        self.embeddings = []
        with open(config.MODEL.LANGUAGE_BACKBONE.EMBEDDING_PATH, 'r') as fin:
            for row in fin:
                row_tk = row.split()
                self.words.append(row_tk[0])
                self.word2idx[row_tk[0]] = len(self.words) - 1
                self.embeddings.append([float(num) for num in row_tk[1:]])
        self.embeddings = torch.tensor(
            np.asarray(self.embeddings, dtype=np.float32)).cuda()
        self.embeddings = nn.Parameter(self.embeddings)
        self.out_channels = self.embeddings.shape[-1]
        if self.config.MODEL.LANGUAGE_BACKBONE.FREEZE:
            self.embeddings.requires_grad = False

        self.words.extend(['[OOV]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'])
        self.oov_idx = len(self.words) - 5
        self.pad_idx = len(self.words) - 4
        self.cls_idx = len(self.words) - 3
        self.sep_idx = len(self.words) - 2
        self.mask_idx = len(self.words) - 1
        self.special_tokens = set([
            self.oov_idx, self.pad_idx, self.cls_idx, self.sep_idx,
            self.mask_idx
        ])
        self.special_embeddings = nn.Parameter(
            torch.zeros(5, self.out_channels).cuda())
        self.special_embeddings.data.normal_(mean=0.0, std=init_std)
        self.aug_embeddings = torch.cat(
            [self.embeddings, self.special_embeddings], dim=0)

        head_config = self.config.MODEL.MMSS_HEAD.TRANSFORMER
        self.mlm = head_config.MASKED_LANGUAGE_MODELING
        self.mlm_prob = head_config.MASKED_LANGUAGE_MODELING_PROB
        self.mlm_prob_mask = head_config.MASKED_LANGUAGE_MODELING_PROB_MASK
        self.mlm_prob_noise = head_config.MASKED_LANGUAGE_MODELING_PROB_NOISE
        self.mlm_during_validation = head_config.MASKED_LANGUAGE_MODELING_VALIDATION

        self.add_position_embedding = config.MODEL.LANGUAGE_BACKBONE.ADD_POSITION_EMBEDDING
        if self.add_position_embedding:
            # maximum length of a sentence
            m = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.max_position_embeddings
            self.position_embedding = nn.Parameter(
                torch.zeros(m, self.out_channels))
            self.position_embedding.data.normal_(mean=0.0, std=init_std)

    def forward(self, text_list):
        tokenized_batch = {
            'input_ids': [],
            'attention_mask': [],
            'encoded_tokens': [],
            'input_embeddings': [],
            'special_tokens_mask': [],
        }
        for i in range(len(text_list)):
            tokens = self.tokenizer.tokenize(text_list[i])
            ids = [self.word2idx.get(t, self.oov_idx) for t in tokens]
            ids = [self.cls_idx] + ids + [self.sep_idx]
            tokenized_batch['input_ids'].append(ids)
        max_len = max([len(i) for i in tokenized_batch['input_ids']])
        for i in range(len(text_list)):
            ids = tokenized_batch['input_ids'][i]
            l = len(ids)
            ids.extend([self.pad_idx] * (max_len - l))

        if self.mlm:
            tokenized_batch['target_ids'] = deepcopy(
                tokenized_batch['input_ids'])
            tokenized_batch['mlm_mask'] = []
            for i, item in enumerate(tokenized_batch['input_ids']):
                mlm_mask = []
                for j in range(len(item)):
                    if (item[j] in self.special_tokens or
                            not (self.training or self.mlm_during_validation)):
                        mlm_mask.append(0)
                        continue
                    prob = np.random.rand()
                    if prob < self.mlm_prob:
                        mlm_mask.append(1)
                        prob /= self.mlm_prob
                        if prob < self.mlm_prob_mask:
                            item[j] = self.mask_idx
                        elif prob < self.mlm_prob_mask + self.mlm_prob_noise:
                            # assuming special tokens are at the end of the words list
                            item[j] = np.random.randint(
                                len(self.words) - len(self.special_tokens))
                    else:
                        mlm_mask.append(0)
                tokenized_batch['mlm_mask'].append(mlm_mask)

        for i in range(len(text_list)):
            ids = np.asarray(tokenized_batch['input_ids'][i])
            tokenized_batch['attention_mask'].append(
                (ids != self.pad_idx).astype(np.int64))
            enc = self.aug_embeddings[ids]
            tokenized_batch['input_embeddings'].append(enc)
            if self.add_position_embedding:
                enc = enc + self.position_embedding[:max_len]
            tokenized_batch['encoded_tokens'].append(enc)
            sp_mask = []
            for tk in ids:
                if tk in self.special_tokens:
                    sp_mask.append(1)
                else:
                    sp_mask.append(0)
            tokenized_batch['special_tokens_mask'].append(sp_mask)

        tokenized_batch['input_embeddings'] = torch.stack(
            tokenized_batch['input_embeddings'], dim=0)
        tokenized_batch['encoded_tokens'] = torch.stack(
            tokenized_batch['encoded_tokens'], dim=0)
        tokenized_batch['input_ids'] = torch.tensor(
            tokenized_batch['input_ids']).cuda()
        tokenized_batch['attention_mask'] = torch.tensor(
            tokenized_batch['attention_mask']).cuda()
        tokenized_batch['special_tokens_mask'] = torch.tensor(
            tokenized_batch['special_tokens_mask']).cuda()
        if self.mlm:
            tokenized_batch['mlm_mask'] = torch.tensor(
                tokenized_batch['mlm_mask']).cuda()
            tokenized_batch['target_ids'] = torch.tensor(
                tokenized_batch['target_ids']).cuda()
        return tokenized_batch
def basic_tokenize(string):
    """Use Bert BasicTokenizer as the tokenizer."""
    return BasicTokenizer().tokenize(string)
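# Quick sketch of basic_tokenize above: BasicTokenizer defaults lowercase the
# input, strip accents, and split punctuation (sample input is illustrative).
print(basic_tokenize(u" \tHeLLo!how \n Are yoU? "))
# expected: ['hello', '!', 'how', 'are', 'you', '?']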
import logging
import json
import csv
import sys

import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import jieba.posseg as psg
from transformers.tokenization_bert import BasicTokenizer

__author__ = "*****@*****.**"

tokenizer = BasicTokenizer(do_lower_case=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()

eps = 10e-8

# ensemble_list = ["14", "17", "21", "22", "23", "25", "26", "27", "28", "29", "33", "34", "35", "36", "37"]
ensemble_list = [
    "14", "17", "21", "22", "23", "25", "26", "27", "28", "29", "33", "34",
    "35", "36", "37", "38", "39"
]
from dataclasses import dataclass, field

import regex as re
from typing import List, Union

from relogic.structures.structure import Structure
from relogic.structures.token import Token
from relogic.structures.span import Span
from transformers.tokenization_bert import BasicTokenizer

basic_tokenizer = BasicTokenizer(do_lower_case=False)

PAT = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)


@dataclass
class Sentence(Structure):
    idx: int = None
    text: str = None
    tokens: List[Token] = field(default_factory=list)
    text_: str = None
    pos: List = field(default_factory=list)
    spans: List[Span] = field(default_factory=list)
    predicate_text: str = None
    predicate_index: int = None
    predicates: List = field(default_factory=list)
    srl_labels: List = field(default_factory=list)
    tokenizer: str = "space"
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

    # Some unicode characters cause errors, so replace them with ' ' or ''.
    orig_text = orig_text.replace(u'\xa0', u' ')
    orig_text = orig_text.replace('', '')
    orig_text = orig_text.replace(u'\u200e', u'')
    pred_text = pred_text.replace(u'\xa0', u' ')
    pred_text = pred_text.replace('', '')
    pred_text = pred_text.replace(u'\u200e', u'')

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    # In some cases, if pred_text contains [UNK] or '##', the alignment fails and
    # this function would return just orig_text. To avoid this, pre-process
    # pred_text based on tok_text.
    correct_text = word_correction(pred_text, tok_text)
    if correct_text == -1:
        if verbose_logging:
            logger.info("Failed to correct: '%s' using '%s'" % (pred_text, tok_text))
        return orig_text
    pred_text = correct_text

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, tok_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
            logger.info("Length error: '%s' vs '%s'", orig_text, tok_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
def build_from_wikipedia(dump_db_file, **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    MentionDB.build_from_wikipedia(dump_db, tokenizer, normalizer, **kwargs)