Code Example #1
    def __init__(self,
                 vocab_file,
                 merges_file,
                 special_tokens=None,
                 max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load(
                'en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning(
                "ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy."
            )
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens
                                      if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.set_special_tokens(special_tokens)
Code Example #2
    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
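
The same behavior can be reproduced outside a test case; a minimal sketch, assuming pytorch_pretrained_bert is installed:

from pytorch_pretrained_bert.tokenization import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=True)
# Lower-cases, strips accents, and splits on whitespace and punctuation.
print(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "))  # ['hello', '!', 'how', 'are', 'you', '?']
print(tokenizer.tokenize(u"H\u00E9llo"))                   # ['hello']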
Code Example #3
    def __init__(self,
                 entity_candidate_generators: Dict[str, MentionGenerator],
                 entity_indexers: Dict[str, TokenIndexer],
                 bert_model_type: str,
                 do_lower_case: bool,
                 whitespace_tokenize: bool = True,
                 max_word_piece_sequence_length: int = 512) -> None:
        """
        Note: the fields need to be used with a pre-generated allennlp vocabulary
        that contains the entity id namespaces and the bert namespace.
        entity_indexers = {'wordnet': indexer for wordnet entities,
                          'wiki': indexer for wiki entities}
        """
        # load BertTokenizer from huggingface
        self.candidate_generators = entity_candidate_generators
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            bert_model_type, do_lower_case=do_lower_case)
        self.bert_word_tokenizer = BasicTokenizer(do_lower_case=False)
        # Target length should include start and end token
        self.max_word_piece_sequence_length = max_word_piece_sequence_length

        self._entity_indexers = entity_indexers
        # for bert, we'll give an empty token indexer with empty name space
        # and do the indexing directly with the bert vocab to bypass
        # indexing in the indexer
        self._bert_single_id_indexer = {
            'tokens': SingleIdTokenIndexer('__bert__')
        }
        self.do_lowercase = do_lower_case
        self.whitespace_tokenize = whitespace_tokenize
        self.dtype = np.float32
Code Example #4
 def __init__(self,
              do_lower_case: bool = True,
              never_split: Optional[List[str]] = None) -> None:
     if never_split is None:
         # Let BertTokenizer use its default
         self.basic_tokenizer = BertTokenizer(do_lower_case)
     else:
         self.basic_tokenizer = BertTokenizer(do_lower_case, never_split)
Code Example #5
    def genTraindataEdu(self, goldEDUs):
        '''
            Generate training data by labeling every word with 0 (I) or 1 (B):
            1 if the word is the beginning of an EDU,
            0 otherwise.
        '''

        p = copy.deepcopy(goldEDUs)
        # use BasicTokenizer instead of the BERT tokenizer, because the BERT
        # tokenizer works at the character/wordpiece level while we are labeling whole words
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.traindataEdu.append([])
        self.traindataEdu[self.traindataEduCount].append(['[CLS]'])
        self.traindataEdu[self.traindataEduCount].append(['[CLS]'])

        for sent in p:
            for j, word in enumerate(list(sent)):  # split each word with space
                self.traindataEdu[self.traindataEduCount][0].append(word)
                if j == 0:
                    self.traindataEdu[self.traindataEduCount][1].append("1")
                else:
                    self.traindataEdu[self.traindataEduCount][1].append("0")
        self.traindataEdu[self.traindataEduCount][0].append('[SEP]')
        self.traindataEdu[self.traindataEduCount][1].append('[SEP]')

        self.traindataEduCount += 1
        return
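
The B/I labeling scheme described in the docstring can be sketched as a standalone function (a minimal illustration of the same loop, not the original method; the sample EDUs below are made up):

def label_edu_boundaries(gold_edus):
    """Label each word '1' if it begins an EDU and '0' otherwise, framed by [CLS]/[SEP]."""
    words, labels = ['[CLS]'], ['[CLS]']
    for edu in gold_edus:
        for j, word in enumerate(edu):
            words.append(word)
            labels.append("1" if j == 0 else "0")
    words.append('[SEP]')
    labels.append('[SEP]')
    return words, labels

print(label_edu_boundaries([["Despite", "the", "rain"], ["we", "went", "out"]]))
# (['[CLS]', 'Despite', 'the', 'rain', 'we', 'went', 'out', '[SEP]'],
#  ['[CLS]', '1', '0', '0', '1', '0', '0', '[SEP]'])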
Code Example #6
    def _bert_basic(self, text):
        if self.word_tokenizer is None:
            from pytorch_pretrained_bert.tokenization import BasicTokenizer

            self.word_tokenizer = BasicTokenizer(**self.config)

        return self.word_tokenizer.tokenize(text)
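
A standalone sketch of the same lazy-initialization pattern; the wrapper class below is hypothetical (the original self.config belongs to a class that is not shown here):

from pytorch_pretrained_bert.tokenization import BasicTokenizer


class LazyBasicTokenizer:
    """Defer building BasicTokenizer until the first call, then reuse it."""

    def __init__(self, **config):
        self.config = config          # e.g. do_lower_case=True
        self.word_tokenizer = None

    def tokenize(self, text):
        if self.word_tokenizer is None:
            self.word_tokenizer = BasicTokenizer(**self.config)
        return self.word_tokenizer.tokenize(text)


print(LazyBasicTokenizer(do_lower_case=True).tokenize("Built lazily, reused afterwards."))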
Code Example #7
class BertBasicWordSplitter(WordSplitter):
    """
    The ``BasicWordSplitter`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """
    def __init__(self, do_lower_case: bool = True) -> None:
        self.basic_tokenizer = BertTokenizer(do_lower_case)

    @overrides
    def split_words(self, sentence: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(sentence)]
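
A hedged usage sketch, assuming an allennlp version (roughly 0.8/0.9) that still ships word_splitter.BertBasicWordSplitter:

from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter

splitter = BertBasicWordSplitter(do_lower_case=True)
tokens = splitter.split_words("Johanson's, was walking a dog.")
print([t.text for t in tokens])  # word-level tokens, punctuation split off, lower-cased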
Code Example #8
 def __init__(self):
     print('[INFO]加载分词器')  # "[INFO] loading the tokenizer"
     self.processor, self.bertTokenizer = init_params()
     label_list = self.processor.get_labels()
     self.label_map = {label: i for i, label in enumerate(label_list)}
     self.tokenizer = BasicTokenizer()
     print('[INFO]分词器加载完毕')  # "[INFO] tokenizer loaded"
     print('[INFO]加载模型')  # "[INFO] loading the model"
     self.model = Bert_CRF()
     self.model.load_state_dict(load_model(args.output_dir))
     self.device = torch.device(args.device if torch.cuda.is_available()
                                and not args.no_cuda else "cpu")
     self.model.to(self.device)
     self.model.eval()
     print('[INFO]模型加载完毕')  # "[INFO] model loaded"
Code Example #9
class BertPreTokenizer(Tokenizer):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """

    def __init__(self, do_lower_case: bool = True, never_split: Optional[List[str]] = None) -> None:
        if never_split is None:
            # Let BertTokenizer use its default
            self.basic_tokenizer = BertTokenizer(do_lower_case)
        else:
            self.basic_tokenizer = BertTokenizer(do_lower_case, never_split)

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(text)]
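
Assuming the class above is importable (its module path differs between the projects that carry it), never_split keeps marker tokens from being lower-cased or split on punctuation; a minimal sketch:

pre_tokenizer = BertPreTokenizer(do_lower_case=True, never_split=["[MASK]"])
print([t.text for t in pre_tokenizer.tokenize("Keep [MASK] intact, please!")])
# expected: ['keep', '[MASK]', 'intact', ',', 'please', '!']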
Code Example #10
File: test.py  Project: Yegiri/FewShotTagging
import random
import json
from pytorch_pretrained_bert.tokenization import BertTokenizer, BasicTokenizer

tokenizer = BertTokenizer.from_pretrained('uncased_L-12_H-768_A-12/vocab.txt')
basic_tokenizer = BasicTokenizer()

with open('ED-data/Few-Shot_ED.json', 'r') as f:
    data = json.loads(f.read())
# test1 Movement 5 labels
# test2 Conflict 4 labels
# test3 Life     10 labels
# test4 Sports   4 labels
# test5 Business 12 labels
# test6 Military 4 labels
# test7 Music    5 labels
domains = [
    'Movement', 'Conflict', 'Life', 'Sports', 'Business', 'Military', 'Music'
]

for order in range(0, 7):
    print('order', order)
    test = {}
    for test_order in range(0, 7):
        if test_order == order or test_order == (order + 1) % 7:
            continue
        B_label = domains[test_order]
        print(B_label)
        L_labels = []

        # select Movement.*
Code Example #11
    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"])
Code Example #12
import re
import json
import random
import args
import os
from pytorch_pretrained_bert.tokenization import BasicTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from parser import entity_extractor
import torch

random.seed(233)
tokenizer = BasicTokenizer()
test_data = 'data/test_data.pth'


class doc:
    def __init__(self, news_id, content):
        self.id = news_id
        self.title = content[0]
        self.content = content[1:]


def make_submit(doc, parser: entity_extractor):
    entity_cont = {}
    title_entity = parser.extract([doc.title])
    content_entity = parser.extract(doc.content)
    doc_entity = content_entity | title_entity
    for entity in doc_entity:
        if doc.title.find(entity) > -1:
            entity_cont[entity] = entity_cont.get(entity, 0) + 10
        for stc in doc.content:
Code Example #13
    def test_chinese(self):
        tokenizer = BasicTokenizer()

        self.assertListEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                             [u"ah", u"\u535A", u"\u63A8", u"zz"])
Code Example #14
 def __init__(self, do_lower_case: bool = True) -> None:
     self.basic_tokenizer = BertTokenizer(do_lower_case)
Code Example #15
def test_BasicTokenizer():
    model = BasicTokenizer()
    ## Korean (Hangul) is not supported
    text = "안녕hello월드ahahah國民mrhs"
    print(model._tokenize_chinese_chars(text))
    print(unicodedata.normalize("NFD", text))
Code Example #16
 def from_config(cls, config: Config):
     basic_tokenizer = BasicTokenizer(do_lower_case=config.lowercase)
     return cls(basic_tokenizer)
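
A standalone sketch of the same from_config factory pattern; Config and the enclosing tokenizer class are not shown above, so the names below are hypothetical:

from dataclasses import dataclass

from pytorch_pretrained_bert.tokenization import BasicTokenizer


@dataclass
class Config:
    lowercase: bool = True


class WordTokenizer:
    def __init__(self, basic_tokenizer):
        self.basic_tokenizer = basic_tokenizer

    @classmethod
    def from_config(cls, config: Config):
        # Build the underlying BasicTokenizer from the config, as in the snippet above.
        return cls(BasicTokenizer(do_lower_case=config.lowercase))

    def tokenize(self, text):
        return self.basic_tokenizer.tokenize(text)


print(WordTokenizer.from_config(Config(lowercase=True)).tokenize("From config!"))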
Code Example #17
class OpenAIGPTTokenizer(object):
    """
    BPE tokenizer. Peculiarities:
        - lower case all inputs
        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
        - argument special_tokens and function set_special_tokens:
            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
    """
    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path,
                                      VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path,
                                       MERGES_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file,
                                               cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} and {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path, vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
            logger.info("loading merges file {}".format(merges_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
            logger.info("loading merges file {} from cache at {}".format(
                merges_file, resolved_merges_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs,
                        **kwargs)
        return tokenizer

    def __init__(self,
                 vocab_file,
                 merges_file,
                 special_tokens=None,
                 max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load(
                'en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning(
                "ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy."
            )
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens
                                      if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """ Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {
            v: k
            for k, v in self.special_tokens.items()
        }
        if self.fix_text is None:
            # Using BERT's BasicTokenizer: we can update the tokenizer
            self.nlp.never_split = special_tokens
        logger.info("Special tokens {}".format(self.special_tokens))

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + '</w>', )
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n  </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend([t for t in self.bpe(token).split(' ')])
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(
                    [t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2
                                       and isinstance(tokens, unicode)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT model ({} > {}). Running this"
                " sequence through the model will result in indexing errors".
                format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def decode(self,
               ids,
               skip_special_tokens=False,
               clean_up_tokenization_spaces=False):
        """Converts a sequence of ids in a string."""
        tokens = self.convert_ids_to_tokens(
            ids, skip_special_tokens=skip_special_tokens)
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        if clean_up_tokenization_spaces:
            out_string = out_string.replace('<unk>', '')
            out_string = out_string.replace(' .', '.').replace(
                ' ?', '?').replace(' !', '!').replace(' ,', ',').replace(
                    ' ,',
                    ',').replace(" n't", "n't").replace(" 'm", "'m").replace(
                        " 're", "'re").replace(" do not", " don't").replace(
                            " 's", "'s").replace(" t ", "'t ").replace(
                                " s ",
                                "'s ").replace(" m ",
                                               "'m ").replace(" 've", "'ve")
        return out_string
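
A hedged end-to-end usage sketch for the tokenizer above, assuming pytorch_pretrained_bert is installed and the 'openai-gpt' vocabulary/merges files can be downloaded:

from pytorch_pretrained_bert import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')  # resolves vocab + merges via from_pretrained above

tokens = tokenizer.tokenize("Hello world, this is a BPE tokenizer.")
ids = tokenizer.convert_tokens_to_ids(tokens)
text = tokenizer.decode(ids, clean_up_tokenization_spaces=True)
print(tokens)
print(ids)
print(text)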
Code Example #18
class BertTokenizerAndCandidateGenerator(Registrable):
    def __init__(self,
                 entity_candidate_generators: Dict[str, MentionGenerator],
                 entity_indexers: Dict[str, TokenIndexer],
                 bert_model_type: str,
                 do_lower_case: bool,
                 whitespace_tokenize: bool = True,
                 max_word_piece_sequence_length: int = 512) -> None:
        """
        Note: the fields need to be used with a pre-generated allennlp vocabulary
        that contains the entity id namespaces and the bert namespace.
        entity_indexers = {'wordnet': indexer for wordnet entities,
                          'wiki': indexer for wiki entities}
        """
        # load BertTokenizer from huggingface
        self.candidate_generators = entity_candidate_generators
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            bert_model_type, do_lower_case=do_lower_case)
        self.bert_word_tokenizer = BasicTokenizer(do_lower_case=False)
        # Target length should include start and end token
        self.max_word_piece_sequence_length = max_word_piece_sequence_length

        self._entity_indexers = entity_indexers
        # for bert, we'll give an empty token indexer with empty name space
        # and do the indexing directly with the bert vocab to bypass
        # indexing in the indexer
        self._bert_single_id_indexer = {
            'tokens': SingleIdTokenIndexer('__bert__')
        }
        self.do_lowercase = do_lower_case
        self.whitespace_tokenize = whitespace_tokenize
        self.dtype = np.float32

    def _word_to_word_pieces(self, word):
        if self.do_lowercase and word not in self.bert_tokenizer.basic_tokenizer.never_split:
            word = word.lower()
        return self.bert_tokenizer.wordpiece_tokenizer.tokenize(word)

    def tokenize_and_generate_candidates(self,
                                         text_a: str,
                                         text_b: str = None):
        """
        # run BertTokenizer.basic_tokenizer.tokenize on sentence1 and sentence2 for word tokenization
        # generate candidate mentions for each of the generators and for each of sentence1 and 2 from word tokenized text
        # run BertTokenizer.wordpiece_tokenizer on sentence1 and sentence2
        # truncate length, add [CLS] and [SEP] to word pieces
        # compute token offsets
        # combine candidate mention spans from sentence1 and sentence2 and remap to word piece indices

        returns:

        {'tokens': List[str], the word piece strings with [CLS] [SEP]
         'segment_ids': List[int] the same length as 'tokens' with 0/1 for sentence1 vs 2
         'candidates': Dict[str, Dict[str, Any]],
            {'wordnet': {'candidate_spans': List[List[int]],
                         'candidate_entities': List[List[str]],
                         'candidate_entity_prior': List[List[float]],
                         'segment_ids': List[int]},
             'wiki': ...}
        }
        """
        offsets_a, grouped_wp_a, tokens_a = self._tokenize_text(text_a)

        if text_b is not None:
            offsets_b, grouped_wp_b, tokens_b = self._tokenize_text(text_b)
            truncate_sequence_pair(grouped_wp_a, grouped_wp_b,
                                   self.max_word_piece_sequence_length - 3)
            offsets_b = offsets_b[:len(grouped_wp_b)]
            tokens_b = tokens_b[:len(grouped_wp_b)]
            instance_b = self._generate_sentence_entity_candidates(
                tokens_b, offsets_b)
            word_piece_tokens_b = [
                word_piece for word in grouped_wp_b for word_piece in word
            ]
        else:
            length_a = sum([len(x) for x in grouped_wp_a])
            while self.max_word_piece_sequence_length - 2 < length_a:
                discarded = grouped_wp_a.pop()
                length_a -= len(discarded)

        word_piece_tokens_a = [
            word_piece for word in grouped_wp_a for word_piece in word
        ]
        offsets_a = offsets_a[:len(grouped_wp_a)]
        tokens_a = tokens_a[:len(grouped_wp_a)]
        instance_a = self._generate_sentence_entity_candidates(
            tokens_a, offsets_a)

        # If we got 2 sentences.
        if text_b is not None:
            # Target length should include start and two end tokens, and then be divided equally between both sentences
            # Note that this will result in potentially shorter documents than original target length,
            # if one (or both) of the sentences are shorter than half the target length.
            tokens = [start_token] + word_piece_tokens_a + [
                sep_token
            ] + word_piece_tokens_b + [sep_token]
            segment_ids = (len(word_piece_tokens_a) +
                           2) * [0] + (len(word_piece_tokens_b) + 1) * [1]
            offsets_a = [x + 1 for x in offsets_a]
            offsets_b = [x + 2 + len(word_piece_tokens_a) for x in offsets_b]
        # Single sentence
        else:
            tokens = [start_token] + word_piece_tokens_a + [sep_token]
            segment_ids = len(tokens) * [0]
            offsets_a = [x + 1 for x in offsets_a]
            offsets_b = None

        for name in instance_a.keys():
            for span in instance_a[name]['candidate_spans']:
                span[0] += 1
                span[1] += 1

        fields: Dict[str, Sequence] = {}

        # concatenating both sentences (for both tokens and ids)
        if text_b is None:
            candidates = instance_a
        else:
            candidates: Dict[str, Field] = {}

            # Merging candidate lists for both sentences.
            for entity_type in instance_b:
                candidate_instance_a = instance_a[entity_type]
                candidate_instance_b = instance_b[entity_type]

                candidates[entity_type] = {}

                for span in candidate_instance_b['candidate_spans']:
                    span[0] += len(word_piece_tokens_a) + 2
                    span[1] += len(word_piece_tokens_a) + 2

                # Merging each of the fields.
                for key in [
                        'candidate_entities', 'candidate_spans',
                        'candidate_entity_priors'
                ]:
                    candidates[entity_type][key] = candidate_instance_a[
                        key] + candidate_instance_b[key]

        for entity_type in candidates.keys():
            # deal with @@PADDING@@
            if len(candidates[entity_type]['candidate_entities']) == 0:
                candidates[entity_type] = get_empty_candidates()
            else:
                padding_indices = []
                has_entity = False
                for cand_i, candidate_list in enumerate(
                        candidates[entity_type]['candidate_entities']):
                    if candidate_list == ["@@PADDING@@"]:
                        padding_indices.append(cand_i)
                        candidates[entity_type]["candidate_spans"][cand_i] = [
                            -1, -1
                        ]
                    else:
                        has_entity = True
                indices_to_remove = []
                if has_entity and len(padding_indices) > 0:
                    # remove all the padding entities since have some valid
                    indices_to_remove = padding_indices
                elif len(padding_indices) > 0:
                    assert len(padding_indices) == len(
                        candidates[entity_type]['candidate_entities'])
                    indices_to_remove = padding_indices[1:]
                for ind in reversed(indices_to_remove):
                    del candidates[entity_type]["candidate_spans"][ind]
                    del candidates[entity_type]["candidate_entities"][ind]
                    del candidates[entity_type]["candidate_entity_priors"][ind]

        # get the segment ids for the spans
        for key, cands in candidates.items():
            span_segment_ids = []
            for candidate_span in cands['candidate_spans']:
                span_segment_ids.append(segment_ids[candidate_span[0]])
            candidates[key]['candidate_segment_ids'] = span_segment_ids

        fields['tokens'] = tokens
        fields['segment_ids'] = segment_ids
        fields['candidates'] = candidates
        fields['offsets_a'] = offsets_a
        fields['offsets_b'] = offsets_b
        return fields

    def _tokenize_text(self, text):
        if self.whitespace_tokenize:
            tokens = text.split()
        else:
            tokens = self.bert_word_tokenizer.tokenize(text)

        word_piece_tokens = []
        offsets = [0]
        for token in tokens:
            word_pieces = self._word_to_word_pieces(token)
            offsets.append(offsets[-1] + len(word_pieces))
            word_piece_tokens.append(word_pieces)
        del offsets[0]
        return offsets, word_piece_tokens, tokens

    def _generate_sentence_entity_candidates(self, tokens, offsets):
        """
        Tokenize sentence, trim it to the target length, and generate entity candidates.
        :param sentence
        :param target_length: The length of the output sentence in terms of word pieces.
        :return: Dict[str, Dict[str, Any]],
            {'wordnet': {'candidate_spans': List[List[int]],
                         'candidate_entities': List[List[str]],
                         'candidate_entity_priors': List[List[float]]},
             'wiki': ...}

        """
        assert len(tokens) == len(
            offsets
        ), f'Length of tokens {len(tokens)} must equal that of offsets {len(offsets)}.'
        entity_instances = {}
        for name, mention_generator in self.candidate_generators.items():
            entity_instances[name] = mention_generator.get_mentions_raw_text(
                ' '.join(tokens), whitespace_tokenize=True)

        for name, entities in entity_instances.items():
            candidate_spans = entities["candidate_spans"]
            adjusted_spans = []
            for start, end in candidate_spans:
                if 0 < start:
                    adjusted_span = [offsets[start - 1], offsets[end] - 1]
                else:
                    adjusted_span = [0, offsets[end] - 1]
                adjusted_spans.append(adjusted_span)
            entities['candidate_spans'] = adjusted_spans
            entity_instances[name] = entities
        return entity_instances

    def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
        """
        tokens_and_candidates is the return from a previous call to
        generate_sentence_entity_candidates.  Converts the dict to
        a dict of fields usable with allennlp.
        """
        fields = {}

        fields['tokens'] = TextField(
            [
                Token(t, text_id=self.bert_tokenizer.vocab[t])
                for t in tokens_and_candidates['tokens']
            ],
            token_indexers=self._bert_single_id_indexer)

        fields['segment_ids'] = ArrayField(np.array(
            tokens_and_candidates['segment_ids']),
                                           dtype=np.int)

        all_candidates = {}
        for key, entity_candidates in tokens_and_candidates[
                'candidates'].items():
            # pad the prior to create the array field
            # make a copy to avoid modifying the input
            candidate_entity_prior = copy.deepcopy(
                entity_candidates['candidate_entity_priors'])
            max_cands = max(len(p) for p in candidate_entity_prior)
            for p in candidate_entity_prior:
                if len(p) < max_cands:
                    p.extend([0.0] * (max_cands - len(p)))
            np_prior = np.array(candidate_entity_prior)

            candidate_fields = {
                "candidate_entity_priors":
                ArrayField(np_prior, dtype=self.dtype),
                "candidate_entities":
                TextField([
                    Token(" ".join(candidate_list)) for candidate_list in
                    entity_candidates["candidate_entities"]
                ],
                          token_indexers={'ids': self._entity_indexers[key]}),
                "candidate_spans":
                ListField([
                    SpanField(span[0], span[1], fields['tokens'])
                    for span in entity_candidates['candidate_spans']
                ]),
                "candidate_segment_ids":
                ArrayField(np.array(
                    entity_candidates['candidate_segment_ids']),
                           dtype=np.int)
            }
            all_candidates[key] = DictField(candidate_fields)

        fields["candidates"] = DictField(all_candidates)

        return fields
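
The offset bookkeeping in _tokenize_text above can be illustrated in isolation; a minimal sketch that drives BertTokenizer's wordpiece tokenizer directly (assuming the 'bert-base-uncased' vocabulary can be downloaded):

from pytorch_pretrained_bert.tokenization import BertTokenizer

bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

grouped_word_pieces, offsets = [], [0]
for word in "the quick embeddings".split():
    pieces = bert.wordpiece_tokenizer.tokenize(word)   # input is already lower-cased
    grouped_word_pieces.append(pieces)
    offsets.append(offsets[-1] + len(pieces))          # cumulative word-piece count
del offsets[0]  # offsets[i] is the end (exclusive) of word i in word-piece space

print(grouped_word_pieces)
print(offsets)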
Code Example #19
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text,
                tok_ns_text,
            )
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Code Example #20
File: word_splitter.py  Project: ziaridoy20/allennlp
 def __init__(self, do_lower_case: bool = True) -> None:
     self.basic_tokenizer = BertTokenizer(do_lower_case)
Code Example #21
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
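
A hedged usage sketch of the worked example from the comments above, assuming get_final_text and its dependencies (collections, logger, BasicTokenizer) are in scope:

recovered = get_final_text(pred_text="steve smith",
                           orig_text="Steve Smith's",
                           do_lower_case=True)
print(recovered)  # expected: "Steve Smith"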
Code Example #22
            with open(file_stub + '.json', 'w') as f:
                json.dump(dump_dicts[i], f)

    else:
        file_stub = 'tf_idf/%s_dream_test_tfidf_agnostic_all_sorted' % args.prefix
        with open(file_stub + '.json', 'w') as f:
            json.dump(dump_dicts[0], f)


if __name__ == '__main__':
    # Parse Args
    arguments = parse_args()

    # Load BERT Tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(arguments.pretrained, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=False)

    # Create Dataset
    if arguments.dataset == 'race':
        D, PA = parse_data(arguments, bert_tokenizer, basic_tokenizer)

        # Compute TF Matrix
        TF = compute_tf(PA)

        # Compute TF-IDF Matrix
        print('\nComputing TF-IDF Matrix...')
        transformer = TfidfTransformer()
        TF_IDF = transformer.fit_transform(TF)
        assert(TF_IDF.shape[0] == len(PA) == len(TF))

        # Compute Scoring Matrix