Example #1
    def make_vocab(self, data_set, use_char=False, embedding_path=None):
        tf.logging.info("Starting Reading Data in {} Manner".format(use_char))
        tokenizer = Tokenizer(do_lower_case=False)

        for data_iter in [data_set.get_train_data(),
                          data_set.get_dev_data(),
                          data_set.get_test_data()]:
            for sample in data_iter:
                label, document = sample

                tokens = tokenizer.tokenize(document)
                for token in tokens:
                    if not use_char:
                        self.insert(token)
                    else:
                        for char in list(token):
                            self.insert(char)

        tf.logging.info("Data Loading Over, Starting Sorted")
        self.sort_vocab(least_freq=3 if use_char else -1)

        # restrict the vocabulary to tokens covered by the pretrained embeddings
        if embedding_path is not None:
            tf.logging.info("Pretrained Word Embedding Loading")
            embed_tokens = {}
            embed_size = None
            with open(embedding_path, 'r') as reader:
                for line in reader:
                    segs = line.strip().split(' ')

                    token = segs[0]
                    # skip tokens that never appear in our data
                    if token not in self.word2id:
                        continue
                    embed_tokens[token] = list(map(float, segs[1:]))

                    if embed_size is None:
                        embed_size = len(segs) - 1

            self.clean()
            for token in embed_tokens:
                self.insert(token)

            # load embeddings
            embeddings = np.zeros([len(embed_tokens), embed_size])
            for token in embed_tokens:
                # subtract 3 to skip the ids reserved for special symbols
                embeddings[self.get_id(token) - 3] = embed_tokens[token]

            self.pretrained_embedding = embeddings

        tf.logging.info("Vocabulary Loading Finished")
Example #2
class ChineseTokenizer():
    def __init__(self):
        self._tokenizer = BasicTokenizer(do_lower_case=False)

    def tokenize_paragraph(self, paragraph):
        sentences = re.split("。|？|！", paragraph)
        ret = []
        for sent in sentences:
            if sent:
                # re-attach a sentence-final delimiter (always "。")
                ret.append(self._tokenizer.tokenize(sent) + ["。"])
        return ret

    def tokenize_paragraph_flat(self, paragraph):
        return self._tokenizer.tokenize(paragraph)
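A quick illustrative check of the sentence-splitting regex used by tokenize_paragraph; the sample paragraph is invented.

import re

paragraph = "今天天气很好。你去过北京吗？我们明天出发！"
sentences = [s for s in re.split("。|？|！", paragraph) if s]
print(sentences)  # ['今天天气很好', '你去过北京吗', '我们明天出发']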
Example #3
    def __init__(self,
                 task_data,
                 max_len,
                 max_w_len,
                 max_p_num,
                 word_vocab,
                 bert_vocab,
                 tokenizer,
                 enable_hierarchy=True,
                 char_vocab=None,
                 enable_char=True,
                 batch_or_token='batch'):
        self.data_set = task_data
        self.max_len = max_len
        self.max_w_len = max_w_len
        self.max_p_num = max_p_num
        self.enable_char = enable_char
        self.batch_or_token = batch_or_token

        self.word_vocab = word_vocab
        self.char_vocab = char_vocab
        if self.enable_char:
            assert self.char_vocab, 'Character vocabulary must be provided!'

        self.bert_vocab = bert_vocab
        self.bert_bpe_tokenizer = tokenizer
        self.bert_word_tokenizer = Tokenizer(do_lower_case=False)
        self.enable_bert = not (tokenizer is None or bert_vocab is None)

        self.enable_hierarchy = enable_hierarchy

        self.nlp = None
        self._create_nlp()

        self.leak_buffer = []
Example #4
class CharTokenizer(object):
    """Runs end-to-end character-level tokenization."""
    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    
    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            # a basic token is a plain string, so iterating it yields characters
            for sub_token in token:
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_tokens_to_ids(self.vocab, tokens)
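Because each token returned by the basic tokenizer is a plain string, the inner loop in tokenize iterates over its characters, so CharTokenizer emits character-level tokens. A toy illustration (the token list is made up):

basic_tokens = ["unaffable", "text"]   # roughly what BasicTokenizer would return
split_tokens = [ch for token in basic_tokens for ch in token]
print(split_tokens)
# ['u', 'n', 'a', 'f', 'f', 'a', 'b', 'l', 'e', 't', 'e', 'x', 't']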
Example #5
    def __init__(self):
        self._tokenizer = BasicTokenizer(do_lower_case=False)

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
Example #7
import argparse
import json

import numpy as np

from docqa.triviaqa.trivia_qa_eval import exact_match_score
from docqa.triviaqa.trivia_qa_eval import f1_score
from docqa.triviaqa.trivia_qa_eval import metric_max_over_ground_truths

from bert.tokenization import BasicTokenizer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=True) 
    parser.add_argument('--prediction_file', required=True) 
    args = parser.parse_args()
    input_file = args.input_file 
    prediction_file = args.prediction_file 

    ground_truths = {}
    tokenizer = BasicTokenizer()
    with open(input_file, "r") as fin:
        for line in fin:
            item = json.loads(line.strip())
            ground_truths[item["question_id"]] = [
                " ".join(tokenizer.tokenize(ans)) for ans in item["answer_text"]
            ]

    predictions = json.load(open(prediction_file, "r"))
    f1 = []
    em = []
    for qid, pred_text in predictions.items():
        f1.append(metric_max_over_ground_truths(
            f1_score, pred_text, ground_truths[qid]))
        em.append(metric_max_over_ground_truths(
            exact_match_score, pred_text, ground_truths[qid]))

    print("F1:", np.mean(f1))
    print("EM:", np.mean(em))
Example #8
class Dataset(object):
    def __init__(self,
                 task_data,
                 max_len,
                 max_w_len,
                 max_p_num,
                 word_vocab,
                 bert_vocab,
                 tokenizer,
                 enable_hierarchy=True,
                 char_vocab=None,
                 enable_char=True,
                 batch_or_token='batch'):
        self.data_set = task_data
        self.max_len = max_len
        self.max_w_len = max_w_len
        self.max_p_num = max_p_num
        self.enable_char = enable_char
        self.batch_or_token = batch_or_token

        self.word_vocab = word_vocab
        self.char_vocab = char_vocab
        if self.enable_char:
            assert self.char_vocab, 'Character vocabulary must be provided!'

        self.bert_vocab = bert_vocab
        self.bert_bpe_tokenizer = tokenizer
        self.bert_word_tokenizer = Tokenizer(do_lower_case=False)
        self.enable_bert = not (tokenizer is None or bert_vocab is None)

        self.enable_hierarchy = enable_hierarchy

        self.nlp = None
        self._create_nlp()

        self.leak_buffer = []

    def _create_nlp(self):
        # get the Language class (e.g. English) and build a bare pipeline
        cls = spacy.util.get_lang_class('en')
        nlp = cls()

        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        self.nlp = nlp

    # split word-based tokens into sub-word based tokens
    def _tokenize(self, tokens):
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_sub_tokens = []
        for (i, token) in enumerate(tokens):
            orig_to_tok_index.append(len(all_sub_tokens))
            sub_tokens = self.bert_bpe_tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_sub_tokens.append(sub_token)
        return all_sub_tokens, tok_to_orig_index, orig_to_tok_index

    def load_data(self, train="train"):
        if train == "train":
            data_iter = self.data_set.get_train_data()
        elif train == "dev":
            data_iter = self.data_set.get_dev_data()
        else:
            assert train == "test"
            data_iter = self.data_set.get_test_data()

        for sample in data_iter:
            label, document = sample

            sentences = []
            if self.enable_hierarchy:
                parsed_document = self.nlp(document.decode('utf-8'))
                for sentence in parsed_document.sents:
                    tokened_sentence = self.bert_word_tokenizer.tokenize(
                        sentence.string.encode('utf-8'))

                    sentences.append(tokened_sentence)
            else:
                sentences.append(
                    self.bert_word_tokenizer.tokenize(
                        document.decode("utf-8")))

            yield label, sentences

    def _process_one_sample(self, sample):
        sample['token_ids'] = [
            self.word_vocab.to_id(sentence, append_eos=False)
            for sentence in sample['tokens']
        ]

        if self.enable_char:
            sample['char_ids'] = []
            for sentence in sample['tokens']:
                sample['char_ids'].append([
                    self.char_vocab.to_id(list(token), append_eos=False)
                    for token in sentence
                ])

        if self.enable_bert:
            sample['subword_ids'] = []
            sample['token_to_subword_index'] = []
            sample['subword_to_token_index'] = []

            for sentence in sample['tokens']:
                sub_tokens, subword_to_token_index, token_to_subword_index = \
                    self._tokenize(sentence)
                subword_ids = [self.bert_vocab.cls] + self.bert_vocab.to_id(sub_tokens)

                sample['subword_ids'].append(subword_ids)
                sample['subword_to_token_index'].append(subword_to_token_index)
                sample['token_to_subword_index'].append([
                    idx if idx < 512 else 0 for idx in token_to_subword_index
                ])

        return sample

    def to_matrix(self, _batch, train="train"):
        # pre-tokenize the dataset
        batch = []
        for bidx, _sample in _batch:
            sample = {
                "label_id": _sample[0],
                "tokens": _sample[1],
            }

            batch.append((bidx, self._process_one_sample(sample)))

        # extract maximum numpy statistics
        max_p_num = max([len(sample['token_ids']) for _, sample in batch])
        max_len = max([
            len(sentence) for _, sample in batch
            for sentence in sample['token_ids']
        ])

        if train == "train":
            max_p_num = min(max_p_num, self.max_p_num)
            max_len = min(max_len, self.max_len)

        max_sub_len = max_len
        if self.enable_bert:
            max_sub_len = max([
                len(sub_sentence) for _, sample in batch
                for sub_sentence in sample['subword_ids']
            ])
            max_sub_len = min(max_sub_len, 512)

        batch_size = len(batch)

        samples = {
            'sample_idx': np.zeros([batch_size], dtype=np.int32),
            'token_ids': np.zeros([batch_size * max_p_num, max_len], dtype=np.int32),
            'l_id': np.zeros([batch_size], dtype=np.int32),
            'raw': batch
        }
        if self.enable_char:
            samples['char_ids'] = np.zeros(
                [batch_size * max_p_num, max_len, self.max_w_len],
                dtype=np.int32)
        if self.enable_bert:
            samples['subword_ids'] = np.zeros(
                [batch_size * max_p_num, max_sub_len], dtype=np.int32)
            samples['subword_back'] = np.zeros(
                [batch_size * max_p_num, max_len], dtype=np.int32)

        for eidx, (sidx, sample) in enumerate(batch):
            samples['sample_idx'][eidx] = sidx

            for pidx, _ in enumerate(sample['token_ids']):
                if pidx >= max_p_num:
                    break
                f_pidx = eidx * max_p_num + pidx

                # deal with tokens
                token_ids = sample['token_ids'][pidx]
                samples['token_ids'][f_pidx, :min(max_len, len(token_ids))] = \
                    token_ids[:max_len]

                # deal with chars
                if self.enable_char:
                    for tidx, c_ids in enumerate(sample['char_ids'][pidx]):
                        if tidx >= max_len:
                            break
                        samples['char_ids'][f_pidx, tidx, :min(self.max_w_len, len(c_ids))] = \
                            c_ids[:self.max_w_len]

                # deal with bert
                if self.enable_bert:
                    subword_ids = sample['subword_ids'][pidx]
                    samples['subword_ids'][f_pidx, :min(max_sub_len, len(subword_ids))] = \
                        subword_ids[:max_sub_len]
                    subword_back = sample['token_to_subword_index'][pidx]
                    samples['subword_back'][f_pidx, :min(max_len, len(subword_back))] = \
                        subword_back[:max_len]

            samples['l_id'][eidx] = sample['label_id']

        return samples

    @threadsafe_generator
    def batcher(self, size, buffer_size=1000, shuffle=True, train="train"):
        # free up the instance length limitation
        if train != "train":
            self.max_len = int(1e6)
            self.batch_or_token = 'batch'

        def _handle_buffer(_buffer):
            sorted_buffer = sorted(
                _buffer, key=lambda xx: max([len(v) for v in xx[1][1]]))

            if self.batch_or_token == 'batch':
                buffer_index = batch_indexer(len(sorted_buffer), size)
            else:
                buffer_index = token_indexer([[len(v) for v in data[1][1]]
                                              for data in sorted_buffer], size)

            index_over_index = batch_indexer(len(buffer_index), 1)
            if shuffle:
                np.random.shuffle(index_over_index)

            for ioi in index_over_index:
                index = buffer_index[ioi[0]]
                batch = [_buffer[ii] for ii in index]
                yield self.to_matrix(batch, train=train)

        buffer = self.leak_buffer
        self.leak_buffer = []
        for i, sample in enumerate(self.load_data(train=train)):
            buffer.append((i, sample))
            if len(buffer) >= buffer_size:
                for data in _handle_buffer(buffer):
                    # hold back tiny tail batches and merge them into the next buffer
                    # (in token mode, size is measured by non-padding token count)
                    batch_size = len(data['raw']) if self.batch_or_token == 'batch' \
                        else np.sum(data['token_ids'] > 0)
                    if batch_size < size * 0.1:
                        self.leak_buffer += data['raw']
                    else:
                        yield data
                buffer = self.leak_buffer
                self.leak_buffer = []

        # deal with data in the buffer
        if len(buffer) > 0:
            for data in _handle_buffer(buffer):
                # hold back tiny tail batches during training
                batch_size = len(data['raw']) if self.batch_or_token == 'batch' \
                    else np.sum(data['token_ids'] > 0)
                if train == 'train' and batch_size < size * 0.1:
                    self.leak_buffer += data['raw']
                else:
                    yield data
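To make the alignment returned by _tokenize concrete, here is a toy trace; fake_bpe is an invented stand-in for the real BERT sub-word tokenizer.

def fake_bpe(token):
    # pretend tokens longer than five characters split into two pieces
    return [token] if len(token) <= 5 else [token[:5], "##" + token[5:]]

tokens = ["neural", "nets", "overfit"]
sub_tokens, tok_to_orig, orig_to_tok = [], [], []
for i, token in enumerate(tokens):
    orig_to_tok.append(len(sub_tokens))
    for piece in fake_bpe(token):
        tok_to_orig.append(i)
        sub_tokens.append(piece)

print(sub_tokens)   # ['neura', '##l', 'nets', 'overf', '##it']
print(tok_to_orig)  # [0, 0, 1, 2, 2]  sub-token -> original token index
print(orig_to_tok)  # [0, 2, 3]        original token -> first sub-token index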
Example #9
def process_data(data_file, output_file, vocab_file):
    """
    Adapted from `gap_to_jsonlines.py` for ProPara data preparation.
    """
    tokenizer = BertTokenizerFast(vocab_file=vocab_file)
    # A second tokenizer is needed to build the sub-token to token map;
    # the HuggingFace tokenizer does not seem to expose this directly.
    basic_tokenizer = BasicTokenizer(do_lower_case=False)

    # Load data
    with open(data_file, 'r') as fp:
        data = json.load(fp)

    output_jsons = []
    # Format as jsonlines & tokenize
    for para in tqdm(data):
        output = {}
        paragraph_text = " ".join(para['sentence_texts'])

        # Sentence map
        sentence_map = [0]
        for sent_num, sent in enumerate(para['sentence_texts']):
            tokens = tokenizer.tokenize(sent)
            sentence_map += [sent_num] * len(tokens)
        sentence_map += [sentence_map[-1]]

        # All tokens. Note: this is the same tokenization used for the
        # sentence map above, even though the two are computed separately.
        tokenized_paragraph = tokenizer(paragraph_text,
                                        return_offsets_mapping=True)
        paragraph_tokens = tokenizer.batch_decode(
            tokenized_paragraph['input_ids'])
        token_character_offsets = tokenized_paragraph['offset_mapping']

        # Subtoken map; the leading 0 element accounts for [CLS]
        subtoken_map = [0]
        for tok_id, token in enumerate(
                basic_tokenizer.tokenize(paragraph_text)):
            subtokens = tokenizer.tokenize(token)
            subtoken_map += [tok_id] * len(subtokens)
        # Add on last subtoken for SEP
        subtoken_map += [subtoken_map[-1]]

        output['para_id'] = para['para_id']
        output['speakers'] = [['[SPL]'] + ['-'] * \
            (len(paragraph_tokens) - 2) + ['[SPL]']]
        output['sentences'] = [paragraph_tokens]
        output['sentence_map'] = sentence_map
        output['clusters'] = [[]]
        output['subtoken_map'] = subtoken_map
        output['token_char_spans'] = token_character_offsets
        output['original_text'] = paragraph_text
        output['doc_key'] = "wb"

        # Open question: given a mention spanning tokens 2-8, how do we map it
        # back to a span in the original sentence?
        output_jsons.append(output)

    # output to output_file
    with open(output_file, 'w') as fp:
        for out in output_jsons:
            fp.write(json.dumps(out) + '\n')
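A self-contained toy version of the sentence_map / subtoken_map bookkeeping above. fake_wordpiece is an invented stand-in for WordPiece, and the two maps are built in one pass here even though process_data computes them separately with two different tokenizers.

def fake_wordpiece(token):
    # pretend tokens longer than four characters split into two pieces
    return [token] if len(token) <= 4 else [token[:4], "##" + token[4:]]

sentences = ["Water evaporates quickly", "Clouds form"]
sentence_map, subtoken_map = [0], [0]   # leading 0s account for [CLS]
tok_id = 0
for sent_num, sent in enumerate(sentences):
    for word in sent.split():
        pieces = fake_wordpiece(word)
        sentence_map += [sent_num] * len(pieces)
        subtoken_map += [tok_id] * len(pieces)
        tok_id += 1
sentence_map += [sentence_map[-1]]      # trailing entries account for [SEP]
subtoken_map += [subtoken_map[-1]]
print(sentence_map)  # [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
print(subtoken_map)  # [0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4]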
Example #10
def main(argv):
    args = argparser().parse_args(argv[1:])
    tokenizer = BasicTokenizer(do_lower_case=args.uncased)
    for fn in args.file:
        basic_tokenize(tokenizer, fn, args)
    return 0
Example #11
def get_final_text(pred_text,
                   orig_text,
                   do_lower_case=True,
                   verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
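A hypothetical usage of get_final_text, mirroring the "Steve Smith's" example from the comment above (it assumes BasicTokenizer is importable, e.g. from bert.tokenization as in Example #7):

pred_text = "steve smith"    # normalized prediction in tokenizer space
orig_text = "Steve Smith's"  # raw span taken from the original document
print(get_final_text(pred_text, orig_text, do_lower_case=True))
# expected output: "Steve Smith"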