def make_vocab(self, data_set, use_char=False, embedding_path=None):
    tf.logging.info("Starting Reading Data in {} Manner".format(use_char))

    tokenizer = Tokenizer(do_lower_case=False)

    for data_iter in [data_set.get_train_data(),
                      data_set.get_dev_data(),
                      data_set.get_test_data()]:
        for sample in data_iter:
            label, document = sample
            tokens = tokenizer.tokenize(document)
            for token in tokens:
                if not use_char:
                    self.insert(token)
                else:
                    for char in list(token):
                        self.insert(char)

    tf.logging.info("Data Loading Over, Starting Sorted")
    self.sort_vocab(least_freq=3 if use_char else -1)

    # process the vocabulary with pretrained-embeddings
    if embedding_path is not None:
        tf.logging.info("Pretrained Word Embedding Loading")

        embed_tokens = {}
        embed_size = None
        with open(embedding_path, 'r') as reader:
            for line in reader:
                segs = line.strip().split(' ')
                token = segs[0]

                # Not used in our training data, pass
                if token not in self.word2id:
                    continue

                embed_tokens[token] = list(map(float, segs[1:]))
                if embed_size is None:
                    embed_size = len(segs) - 1

        self.clean()
        for token in embed_tokens:
            self.insert(token)

        # load embeddings
        embeddings = np.zeros([len(embed_tokens), embed_size])
        for token in embed_tokens:
            # 3: the special symbols
            embeddings[self.get_id(token) - 3] = embed_tokens[token]
        self.pretrained_embedding = embeddings

    tf.logging.info("Vocabulary Loading Finished")
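The pretrained-embedding branch above expects a GloVe-style text file, one token per line followed by its space-separated float vector. A minimal sketch of that assumed format (toy values, hypothetical file name and owning vocabulary object):

# Hypothetical toy embedding file in the "<token> <v1> ... <vN>" format
# that make_vocab() parses line by line.
with open("toy_embeddings.txt", "w") as writer:
    writer.write("the 0.1 0.2 0.3\n")
    writer.write("cat 0.4 0.5 0.6\n")

# vocab.make_vocab(data_set, use_char=False, embedding_path="toy_embeddings.txt")
# ('vocab' and 'data_set' stand in for the project's own objects.)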
class ChineseTokenizer():
    def __init__(self):
        self._tokenizer = BasicTokenizer(do_lower_case=False)

    def tokenize_paragraph(self, paragraph):
        # split on Chinese sentence-final punctuation (。？！)
        sentences = re.split("。|？|！", paragraph)
        ret = []
        for sent in sentences:
            if sent:
                ret.append(self._tokenizer.tokenize(sent) + ["。"])
        return ret

    def tokenize_paragraph_flat(self, paragraph):
        return self._tokenizer.tokenize(paragraph)
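A brief usage sketch with an invented paragraph; tokenize_paragraph returns one token list per sentence (each re-terminated with "。"), while tokenize_paragraph_flat tokenizes the whole paragraph at once:

# Usage sketch; the paragraph text is an invented example.
chinese_tokenizer = ChineseTokenizer()
paragraph = u"今天天气很好。你去哪里？"
per_sentence = chinese_tokenizer.tokenize_paragraph(paragraph)   # one token list per sentence
flat = chinese_tokenizer.tokenize_paragraph_flat(paragraph)      # single flat token list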
def __init__(self, task_data, max_len, max_w_len, max_p_num, word_vocab,
             bert_vocab, tokenizer, enable_hierarchy=True, char_vocab=None,
             enable_char=True, batch_or_token='batch'):
    self.data_set = task_data
    self.max_len = max_len
    self.max_w_len = max_w_len
    self.max_p_num = max_p_num
    self.enable_char = enable_char
    self.batch_or_token = batch_or_token

    self.word_vocab = word_vocab
    self.char_vocab = char_vocab
    if self.enable_char:
        assert self.char_vocab, 'Character vocabulary must be provided!'

    self.bert_vocab = bert_vocab
    self.bert_bpe_tokenizer = tokenizer
    self.bert_word_tokenizer = Tokenizer(do_lower_case=False)
    self.enable_bert = not (tokenizer is None or bert_vocab is None)

    self.enable_hierarchy = enable_hierarchy

    self.nlp = None
    self._create_nlp()

    self.leak_buffer = []
class CharTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            # split each basic token into individual characters
            for sub_token in token:
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_tokens_to_ids(self.vocab, tokens)
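A usage sketch, assuming vocab.txt is a BERT-style vocabulary file containing the individual characters this tokenizer emits:

# Hypothetical vocab path; tokenize() yields one character per token
# because the loop above iterates over the characters of each basic token.
char_tokenizer = CharTokenizer(vocab_file="vocab.txt", do_lower_case=True)
chars = char_tokenizer.tokenize("Hello world")   # ['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']
char_ids = char_tokenizer.convert_tokens_to_ids(chars)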
def __init__(self):
    self._tokenizer = BasicTokenizer(do_lower_case=False)
def __init__(self, vocab_file, do_lower_case=True):
    self.vocab = load_vocab(vocab_file)
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
import argparse
import json

import numpy as np

from docqa.triviaqa.trivia_qa_eval import exact_match_score
from docqa.triviaqa.trivia_qa_eval import f1_score
from docqa.triviaqa.trivia_qa_eval import metric_max_over_ground_truths
from bert.tokenization import BasicTokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=True)
    parser.add_argument('--prediction_file', required=True)
    args = parser.parse_args()

    input_file = args.input_file
    prediction_file = args.prediction_file

    ground_truths = {}
    tokenizer = BasicTokenizer()
    with open(input_file, "r") as fin:
        for line in fin:
            item = json.loads(line.strip())
            ground_truths[item["question_id"]] = [
                " ".join(tokenizer.tokenize(ans)) for ans in item["answer_text"]]

    predictions = json.load(open(prediction_file, "r"))

    f1 = []
    em = []
    for (qid, pred_text) in predictions.items():
        f1.append(metric_max_over_ground_truths(
            f1_score, pred_text, ground_truths[qid]))
        em.append(metric_max_over_ground_truths(
            exact_match_score, pred_text, ground_truths[qid]))

    print("F1:", np.mean(f1))
    print("EM:", np.mean(em))
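As the script reads them, --input_file is JSON Lines with a question_id and a list of answer_text strings per line, and --prediction_file is a single JSON object mapping question ids to predicted answers. A toy pair of files (invented ids and answers, hypothetical script name):

# Toy inputs matching the formats the evaluation loop expects.
import json

with open("eval_input.jsonl", "w") as f:
    f.write(json.dumps({"question_id": "q1", "answer_text": ["Paris", "City of Paris"]}) + "\n")

with open("predictions.json", "w") as f:
    json.dump({"q1": "Paris"}, f)

# python evaluate.py --input_file eval_input.jsonl --prediction_file predictions.json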
class Dataset(object):
    def __init__(self, task_data, max_len, max_w_len, max_p_num, word_vocab,
                 bert_vocab, tokenizer, enable_hierarchy=True, char_vocab=None,
                 enable_char=True, batch_or_token='batch'):
        self.data_set = task_data
        self.max_len = max_len
        self.max_w_len = max_w_len
        self.max_p_num = max_p_num
        self.enable_char = enable_char
        self.batch_or_token = batch_or_token

        self.word_vocab = word_vocab
        self.char_vocab = char_vocab
        if self.enable_char:
            assert self.char_vocab, 'Character vocabulary must be provided!'

        self.bert_vocab = bert_vocab
        self.bert_bpe_tokenizer = tokenizer
        self.bert_word_tokenizer = Tokenizer(do_lower_case=False)
        self.enable_bert = not (tokenizer is None or bert_vocab is None)

        self.enable_hierarchy = enable_hierarchy

        self.nlp = None
        self._create_nlp()

        self.leak_buffer = []

    def _create_nlp(self):
        # 1. get Language instance, e.g. English()
        cls = spacy.util.get_lang_class('en')
        nlp = cls()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        self.nlp = nlp

    # split word-based tokens into sub-word based tokens
    def _tokenize(self, tokens):
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_sub_tokens = []
        for (i, token) in enumerate(tokens):
            orig_to_tok_index.append(len(all_sub_tokens))
            sub_tokens = self.bert_bpe_tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_sub_tokens.append(sub_token)
        return all_sub_tokens, tok_to_orig_index, orig_to_tok_index

    def load_data(self, train="train"):
        if train == "train":
            data_iter = self.data_set.get_train_data()
        elif train == "dev":
            data_iter = self.data_set.get_dev_data()
        else:
            assert train == "test"
            data_iter = self.data_set.get_test_data()

        for sample in data_iter:
            label, document = sample

            sentences = []
            if self.enable_hierarchy:
                parsed_document = self.nlp(document.decode('utf-8'))
                for sentence in parsed_document.sents:
                    tokened_sentence = self.bert_word_tokenizer.tokenize(
                        sentence.string.encode('utf-8'))
                    sentences.append(tokened_sentence)
            else:
                sentences.append(
                    self.bert_word_tokenizer.tokenize(
                        document.decode("utf-8")))

            yield label, sentences

    def _process_one_sample(self, sample):
        sample['token_ids'] = [
            self.word_vocab.to_id(sentence, append_eos=False)
            for sentence in sample['tokens']
        ]

        if self.enable_char:
            sample['char_ids'] = []
            for sentence in sample['tokens']:
                sample['char_ids'].append([
                    self.char_vocab.to_id(list(token), append_eos=False)
                    for token in sentence
                ])

        if self.enable_bert:
            sample['subword_ids'] = []
            sample['token_to_subword_index'] = []
            sample['subword_to_token_index'] = []
            for sentence in sample['tokens']:
                sub_info = self._tokenize(sentence)
                sub_tokens = sub_info[0]
                subword_to_token_index = sub_info[1]
                token_to_subword_index = sub_info[2]

                subword_ids = [self.bert_vocab.cls] + self.bert_vocab.to_id(sub_tokens)
                sample['subword_ids'].append(subword_ids)
                sample['subword_to_token_index'].append(subword_to_token_index)
                sample['token_to_subword_index'].append([
                    idx if idx < 512 else 0 for idx in token_to_subword_index
                ])

        return sample

    def to_matrix(self, _batch, train="train"):
        # pre-tokenize the dataset
        batch = []
        for bidx, _sample in _batch:
            sample = {
                "label_id": _sample[0],
                "tokens": _sample[1],
            }
            batch.append((bidx, self._process_one_sample(sample)))

        # extract maximum numpy statistics
        max_p_num = max([len(sample['token_ids']) for _, sample in batch])
        max_len = max([
            len(sentence) for _, sample in batch
            for sentence in sample['token_ids']
        ])
        if train == "train":
            max_p_num = min(max_p_num, self.max_p_num)
            max_len = min(max_len, self.max_len)

        max_sub_len = max_len
        if self.enable_bert:
            max_sub_len = max([
                len(sub_sentence) for _, sample in batch
                for sub_sentence in sample['subword_ids']
            ])
            max_sub_len = min(max_sub_len, 512)

        batch_size = len(batch)
        samples = {
            'sample_idx': np.zeros([batch_size], dtype=np.int32),
            'token_ids': np.zeros([batch_size * max_p_num, max_len], dtype=np.int32),
            'l_id': np.zeros([batch_size], dtype=np.int32),
            'raw': batch
        }
        if self.enable_char:
            samples['char_ids'] = np.zeros(
                [batch_size * max_p_num, max_len, self.max_w_len], dtype=np.int32)
        if self.enable_bert:
            samples['subword_ids'] = np.zeros(
                [batch_size * max_p_num, max_sub_len], dtype=np.int32)
            samples['subword_back'] = np.zeros(
                [batch_size * max_p_num, max_len], dtype=np.int32)

        for eidx, (sidx, sample) in enumerate(batch):
            samples['sample_idx'][eidx] = sidx

            for pidx, _ in enumerate(sample['token_ids']):
                if pidx >= max_p_num:
                    break
                f_pidx = eidx * max_p_num + pidx

                # deal with tokens
                token_ids = sample['token_ids'][pidx]
                samples['token_ids'][f_pidx, :min(max_len, len(token_ids))] = \
                    token_ids[:max_len]

                # deal with chars
                if self.enable_char:
                    for tidx, c_ids in enumerate(sample['char_ids'][pidx]):
                        if tidx >= max_len:
                            break
                        samples['char_ids'][f_pidx, tidx, :min(self.max_w_len, len(c_ids))] = \
                            c_ids[:self.max_w_len]

                # deal with bert
                if self.enable_bert:
                    subword_ids = sample['subword_ids'][pidx]
                    samples['subword_ids'][f_pidx, :min(max_sub_len, len(subword_ids))] = \
                        subword_ids[:max_sub_len]

                    subword_back = sample['token_to_subword_index'][pidx]
                    samples['subword_back'][f_pidx, :min(max_len, len(subword_back))] = \
                        subword_back[:max_len]

            samples['l_id'][eidx] = sample['label_id']

        return samples

    @threadsafe_generator
    def batcher(self, size, buffer_size=1000, shuffle=True, train="train"):
        # free up the instance length limitation
        if train != "train":
            self.max_len = int(1e6)
            self.batch_or_token = 'batch'

        def _handle_buffer(_buffer):
            sorted_buffer = sorted(
                _buffer, key=lambda xx: max([len(v) for v in xx[1][1]]))

            if self.batch_or_token == 'batch':
                buffer_index = batch_indexer(len(sorted_buffer), size)
            else:
                buffer_index = token_indexer(
                    [[len(v) for v in data[1][1]] for data in sorted_buffer], size)

            index_over_index = batch_indexer(len(buffer_index), 1)
            if shuffle:
                np.random.shuffle(index_over_index)

            for ioi in index_over_index:
                index = buffer_index[ioi[0]]
                batch = [_buffer[ii] for ii in index]
                yield self.to_matrix(batch, train=train)

        buffer = self.leak_buffer
        self.leak_buffer = []
        for i, sample in enumerate(self.load_data(train=train)):
            buffer.append((i, sample))

            if len(buffer) >= buffer_size:
                for data in _handle_buffer(buffer):
                    # check whether the data is tailed
                    # tokens are counted on 'p'
                    batch_size = len(data['raw']) if self.batch_or_token == 'batch' \
                        else np.sum(data['token_ids'] > 0)
                    if batch_size < size * 0.1:
                        self.leak_buffer += data['raw']
                    else:
                        yield data
                buffer = self.leak_buffer
                self.leak_buffer = []

        # deal with data in the buffer
        if len(buffer) > 0:
            for data in _handle_buffer(buffer):
                # check whether the data is tailed
                batch_size = len(data['raw']) if self.batch_or_token == 'batch' \
                    else np.sum(data['token_ids'] > 0)
                if train == 'train' and batch_size < size * 0.1:
                    self.leak_buffer += data['raw']
                else:
                    yield data
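A hedged usage sketch of the Dataset class; task_data, the vocabularies, and the BERT BPE tokenizer are stand-ins for objects built elsewhere in the project (task_data must expose get_train_data/get_dev_data/get_test_data yielding (label, document) pairs, and the vocabularies must provide to_id):

# Sketch only: task_data, word_vocab, char_vocab, bert_vocab and bpe_tokenizer
# are assumed to be constructed by the surrounding project.
dataset = Dataset(task_data, max_len=256, max_w_len=16, max_p_num=8,
                  word_vocab=word_vocab, bert_vocab=bert_vocab,
                  tokenizer=bpe_tokenizer, char_vocab=char_vocab,
                  enable_char=True, batch_or_token='batch')

for batch in dataset.batcher(size=32, buffer_size=1000, shuffle=True, train="train"):
    token_ids = batch['token_ids']   # [batch_size * max_p_num, max_len] int32 matrix
    label_ids = batch['l_id']        # [batch_size] label ids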
def process_data(data_file, output_file, vocab_file):
    """ Adapted from the `gap_to_jsonlines.py` for propara data prep """
    tokenizer = BertTokenizerFast(vocab_file=vocab_file)
    # Need to have this other tokenizer so we can build the sub-token
    # to token map. It seems the huggingface tokenizer doesn't have this
    # functionality
    basic_tokenizer = BasicTokenizer(do_lower_case=False)

    # Load data
    with open(data_file, 'r') as fp:
        data = json.load(fp)

    output_jsons = []

    # Format as jsonlines & tokenize
    for para in tqdm(data):
        output = {}
        paragraph_text = " ".join(para['sentence_texts'])

        # Sentence map
        sentence_map = [0]
        for sent_num, sent in enumerate(para['sentence_texts']):
            tokens = tokenizer.tokenize(sent)
            sentence_map += [sent_num] * len(tokens)
        sentence_map += [sentence_map[-1]]

        # All tokens
        # Note this is the same as what we used to calculate the sentence map
        # even though they are done separately
        tokenized_paragraph = tokenizer(paragraph_text, return_offsets_mapping=True)
        paragraph_tokens = tokenizer.batch_decode(tokenized_paragraph['input_ids'])
        token_character_offsets = tokenized_paragraph['offset_mapping']

        # Subtoken map
        # 0 element is for CLS
        subtoken_map = [0]
        for tok_id, token in enumerate(basic_tokenizer.tokenize(paragraph_text)):
            subtokens = tokenizer.tokenize(token)
            subtoken_map += [tok_id] * len(subtokens)
        # Add on last subtoken for SEP
        subtoken_map += [subtoken_map[-1]]

        output['para_id'] = para['para_id']
        output['speakers'] = [['[SPL]'] + ['-'] * (len(paragraph_tokens) - 2) + ['[SPL]']]
        output['sentences'] = [paragraph_tokens]
        output['sentence_map'] = sentence_map
        output['clusters'] = [[]]
        output['subtoken_map'] = subtoken_map
        output['token_char_spans'] = token_character_offsets
        output['original_text'] = paragraph_text
        output['doc_key'] = "wb"

        # Test, if we know we have a mention on tokens 2-8,
        # how do we translate that to a span in the original sentence?

        output_jsons.append(output)

    # output to output_file
    with open(output_file, 'w') as fp:
        for out in output_jsons:
            fp.write(json.dumps(out) + '\n')
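A hedged call sketch; the paths are placeholders, and vocab_file should point at the wordpiece vocabulary matching the BertTokenizerFast checkpoint in use:

# Placeholder paths; the input JSON is expected to be a list of paragraphs
# carrying 'para_id' and 'sentence_texts' fields, as read above.
process_data(data_file="propara_paragraphs.json",
             output_file="propara_paragraphs.jsonlines",
             vocab_file="bert-base-cased-vocab.txt")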
def main(argv):
    args = argparser().parse_args(argv[1:])
    tokenizer = BasicTokenizer(do_lower_case=args.uncased)
    for fn in args.file:
        basic_tokenize(tokenizer, fn, args)
    return 0
def get_final_text(pred_text, orig_text, do_lower_case=True, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
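The "Steve Smith" example from the comments can be checked directly; with lower casing enabled, the normalized prediction is projected back onto the original surface form:

# Matches the worked example described in the comments above.
final = get_final_text("steve smith", "Steve Smith's", do_lower_case=True)
assert final == "Steve Smith"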