Example #1
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              columns_header: List = COLUMNS_HEADER,
              use_header: str = "ner",
              ignore_tag: str = None,
              input_scheme: str = 'IOB1',
              tag_scheme: str = 'IOB2',
              field_sep: str = None,
              encoding: str = "latin-1",
              lm_task: bool = False,
              start_end: bool = False,
              max_characters_per_token: int = 50,
              lazy: bool = True) -> None:
     super(CustomConll, self).__init__(lazy=lazy)
     self._token_indexers = token_indexers or {'tokens': TokenIndexer()}
     self._columns_header = columns_header
     self._use_header = use_header
     self._ignore_tag = ignore_tag
     self._input_scheme = input_scheme
     self._tag_scheme = tag_scheme
     self._field_sep = field_sep
     self._encoding = encoding
     self._lm_task = lm_task
     self._start_end = start_end
     self._max_characters_per_token = max_characters_per_token
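A hedged usage sketch of constructing this reader directly with keyword arguments (assumes CustomConll and its COLUMNS_HEADER default are importable; SingleIdTokenIndexer is AllenNLP's standard word-id indexer and is only one reasonable choice here):

    from allennlp.data.token_indexers import SingleIdTokenIndexer

    # Every keyword below is a parameter of the __init__ shown above.
    reader = CustomConll(token_indexers={"tokens": SingleIdTokenIndexer()},
                         use_header="ner",
                         tag_scheme="IOB2",
                         lazy=False)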
Example #2
 def from_params(cls, params: Params) -> 'LanguageModelingReader':
     tokens_per_instance = params.pop_int('tokens_per_instance', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                   tokenizer=tokenizer,
                                   token_indexers=token_indexers)
Example #3
 def from_params(cls, params: Params) -> 'LanguageModelingReader':
     tokens_per_instance = params.pop_int('tokens_per_instance', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                   tokenizer=tokenizer,
                                   token_indexers=token_indexers,
                                   lazy=lazy)
Example #4
 def from_params(cls, params: Params) -> 'SnliReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
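A minimal sketch of the nested configuration this loop consumes; "single_id" and "characters" are built-in registered indexer names, while the outer key names are illustrative:

    from allennlp.common import Params
    from allennlp.data.token_indexers import TokenIndexer

    token_indexer_params = Params({
        "tokens": {"type": "single_id", "lowercase_tokens": True},
        "token_characters": {"type": "characters"},
    })
    # Mirrors the loop above: each nested block is dispatched on its "type" key.
    token_indexers = {name: TokenIndexer.from_params(indexer_params)
                      for name, indexer_params in token_indexer_params.items()}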
Example #5
    def __init__(self,
                 model_name: str,
                 namespace: str = "tags",
                 max_length: int = None,
                 **kwargs) -> None:
        TokenIndexer.__init__(self, **kwargs)
        self._namespace = namespace
        self._allennlp_tokenizer = PretrainedChineseBertTokenizer(model_name)
        self._tokenizer = self._allennlp_tokenizer.tokenizer
        self._added_to_vocabulary = False

        self._num_added_start_tokens = len(self._allennlp_tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(self._allennlp_tokenizer.single_sequence_end_tokens)

        self._max_length = max_length
        if self._max_length is not None:
            num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
            self._effective_max_length = (  # we need to take into account special tokens
                self._max_length - num_added_tokens
            )
            if self._effective_max_length <= 0:
                raise ValueError(
                    "max_length needs to be greater than the number of special tokens inserted."
                )
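A small numeric sketch of the special-token arithmetic above, assuming a BERT-style tokenizer where tokenize("a") yields [CLS], a, [SEP] and max_length is 512:

    num_added_tokens = 3 - 1            # len(tokenize("a")) - 1 == number of special tokens
    max_length = 512                    # assumed value
    effective_max_length = max_length - num_added_tokens
    assert effective_max_length == 510  # room left for real word pieces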
Example #6
 def from_params(cls, params: Params) -> 'SrlReader':
     """
     Parameters
     ----------
     token_indexers: ``List[Params]``, optional
     """
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SrlReader(token_indexers=token_indexers)
Example #7
 def from_params(cls, params: Params) -> 'SquadReader':
     """
     Parameters
     ----------
     tokenizer : ``Params``, optional (default=``{}``)
     token_indexers: ``Params``, optional (default=``{}``)
     """
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', {})
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class, so if no parameters are given we
     # must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer, token_indexers=token_indexers)
Example #8
 def from_params(cls, params: Params) -> 'TargzReaders':
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     vocab_file = params.pop('vocab_file')
     mentions_tarfile = params.pop('mentions_tarfile')
     compression_mode = params.pop('compression_mode', 'gz')
     encoding = params.pop('encoding', 'utf-8')
     start_end = params.pop('start_end', False)
     label_map = params.pop('label_map', LABEL_MAP)
     lm_task = params.pop('lm_task', False)
     params.assert_empty(cls.__name__)
     return TargzReaders(token_indexers=token_indexers,
                         vocab_file=vocab_file,
                         mentions_tarfile=mentions_tarfile,
                         compression_mode=compression_mode,
                         label_map=label_map,
                         encoding=encoding,
                         lm_task=lm_task,
                         start_end=start_end)
Example #9
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 vocab_file: str = None,
                 mentions_tarfile: str = None,
                 compression_mode: str = 'gz',
                 label_map: List[str] = None,
                 encoding: str = "utf-8",
                 lm_task: bool = False,
                 start_end: bool = False,
                 lazy: bool = True) -> None:
        super(TargzReaders, self).__init__(lazy=lazy)
        self._token_indexers = token_indexers or {'tokens': TokenIndexer()}
        self._mentions_tarfile = mentions_tarfile
        self._compression_mode = compression_mode
        self._vocab_file = vocab_file
        self._encoding = encoding
        self._lm_task = lm_task
        self._start_end = start_end
        self._label_map = label_map

        self.vocab = []
        self.load_vocab()
Example #10
 def from_params(cls, params: Params) -> 'LanguageModelingReader':
     """
     Parameters
     ----------
     filename : ``str``
     tokens_per_instance : ``int``, optional (default=``None``)
     tokenizer : ``Params``, optional
     token_indexers: ``List[Params]``, optional
     """
     tokens_per_instance = params.pop('tokens_per_instance', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                   tokenizer=tokenizer,
                                   token_indexers=token_indexers)
Example #11
    def from_params(cls, params: Params) -> 'SequenceTaggingDatasetReader':
        """
        Parameters
        ----------
        token_indexers: ``Dict[Params]``, optional
        """
        token_indexers = {}
        token_indexer_params = params.pop('token_indexers', Params({}))
        for name, indexer_params in token_indexer_params.items():
            token_indexers[name] = TokenIndexer.from_params(indexer_params)
        # The default parameters are contained within the class,
        # so if no parameters are given we must pass None.
        if token_indexers == {}:
            token_indexers = None

        word_tag_delimiter = params.pop("word_tag_delimiter",
                                        DEFAULT_WORD_TAG_DELIMITER)
        token_delimiter = params.pop("token_delimiter", None)

        params.assert_empty(cls.__name__)
        return SequenceTaggingDatasetReader(
            token_indexers=token_indexers,
            word_tag_delimiter=word_tag_delimiter,
            token_delimiter=token_delimiter)
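A hedged sketch of the line format this reader handles; "###" is AllenNLP's DEFAULT_WORD_TAG_DELIMITER and the sentence itself is made up:

    line = "The###DET dog###NN barked###VBD"
    word_tag_delimiter = "###"
    token_delimiter = None                      # None means "split on whitespace"
    pairs = [pair.rsplit(word_tag_delimiter, 1)
             for pair in line.split(token_delimiter)]
    tokens = [token for token, _ in pairs]      # ['The', 'dog', 'barked']
    tags = [tag for _, tag in pairs]            # ['DET', 'NN', 'VBD']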
Example #12
 def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
     """
     Parameters
     ----------
     negative_sentence_selection : ``str``, optional (default="paragraph")
     tokenizer : ``Params``, optional
     token_indexers: ``List[Params]``, optional
     """
     negative_sentence_selection = params.pop('negative_sentence_selection',
                                              'paragraph')
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SquadSentenceSelectionReader(
         negative_sentence_selection=negative_sentence_selection,
         tokenizer=tokenizer,
         token_indexers=token_indexers)
Example #13
 def from_params(cls, params: Params) -> 'CustomConll':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     columns_header = params.pop('columns_header', COLUMNS_HEADER)
     use_header = params.pop('use_header', 'ner')
     encoding = params.pop('encoding', 'latin-1')
     ignore_tag = params.pop('ignore_tag', None)
     input_scheme = params.pop('input_scheme', 'IOB1')
     tag_scheme = params.pop('tag_scheme', 'IOB2')
     field_sep = params.pop('field_sep', None)
     lm_task = params.pop('lm_task', False)
     max_characters_per_token = params.pop('max_characters_per_token', 50)
     start_end = params.pop('start_end', False)
     params.assert_empty(cls.__name__)
     return CustomConll(token_indexers=token_indexers,
                        columns_header=columns_header,
                        use_header=use_header,
                        ignore_tag=ignore_tag,
                        input_scheme=input_scheme,
                        tag_scheme=tag_scheme,
                        field_sep=field_sep,
                        encoding=encoding,
                        lm_task=lm_task,
                        max_characters_per_token=max_characters_per_token, 
                        start_end=start_end)
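A hedged sketch of driving the reader from a Params object instead (assumes from_params is exposed as a classmethod, as in the framework; the configuration values are illustrative):

    from allennlp.common import Params

    params = Params({
        "token_indexers": {"tokens": {"type": "single_id"}},
        "use_header": "ner",
        "tag_scheme": "IOB2",
        "encoding": "utf-8",
    })
    reader = CustomConll.from_params(params)    # assert_empty passes: every key above is popped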
Example #14
        if not tag:
            tag = u'NONE'
        counter[self._namespace][tag] += 1

    #overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        tags = [u'NONE' if token.ent_type_ is None else token.ent_type_ for token in tokens]

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))

NerTagIndexer = TokenIndexer.register(u"ner_tag")(NerTagIndexer)
Example #15
 def test_registry_has_builtin_token_indexers(self):
     assert TokenIndexer.by_name("single_id").__name__ == "SingleIdTokenIndexer"
     assert TokenIndexer.by_name("characters").__name__ == "TokenCharactersIndexer"
Example #16
        for token in tokens:
            if getattr(token, u'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {index_name: indices}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))

SingleIdTokenIndexer = TokenIndexer.register(u"single_id")(SingleIdTokenIndexer)
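A toy illustration of the pad_sequence_to_length helper used above (from allennlp.common.util); the id values are made up:

    from allennlp.common.util import pad_sequence_to_length

    ids = [4, 9, 2]
    padded = pad_sequence_to_length(ids, 6)     # pads on the right with the default value 0
    assert padded == [4, 9, 2, 0, 0, 0]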
Example #17
                self._logged_errors.add(token.text)
            dep_label = u'NONE'
        counter[self.namespace][dep_label] += 1

    #overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        dep_labels = [token.dep_ or u'NONE' for token in tokens]

        return {
            index_name: [
                vocabulary.get_token_index(dep_label, self.namespace)
                for dep_label in dep_labels
            ]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


DepLabelIndexer = TokenIndexer.register(u"dependency_label")(DepLabelIndexer)
Example #18
 def test_registry_has_builtin_token_indexers(self):
     assert TokenIndexer.by_name('single_id').__name__ == 'SingleIdTokenIndexer'
     assert TokenIndexer.by_name('characters').__name__ == 'TokenCharactersIndexer'
Example #19
        padded_tokens = pad_sequence_to_length(
            tokens[key],
            desired_num_tokens[key],
            default_value=self.get_padding_token)

        # Pad the characters within the tokens.
        desired_token_length = padding_lengths[u'num_token_characters']
        longest_token = max(tokens[key], key=len, default=[])
        padding_value = 0
        if desired_token_length > len(longest_token):
            # Since we want to pad to greater than the longest token, we add a
            # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
            padded_tokens.append([padding_value] * desired_token_length)
        # pad the list of lists to the longest sublist, appending 0's
        padded_tokens = list(
            izip(*itertools.zip_longest(*padded_tokens,
                                        fillvalue=padding_value)))
        if desired_token_length > len(longest_token):
            # Removes the "dummy token".
            padded_tokens.pop()
        # Truncates all the tokens to the desired length, and return the result.
        return {
            key:
            [list(token[:desired_token_length]) for token in padded_tokens]
        }


TokenCharactersIndexer = TokenIndexer.register(u"characters")(
    TokenCharactersIndexer)
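A toy walk-through of the dummy-token trick above; the character ids are made up, and the resulting tuples would still be truncated and converted to lists as in the real code:

    import itertools

    padded_tokens = [[3, 7], [5]]               # two tokens as lists of character ids
    desired_token_length = 4
    padding_value = 0
    padded_tokens.append([padding_value] * desired_token_length)   # dummy token sets the width
    padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens,
                                                    fillvalue=padding_value)))
    padded_tokens.pop()                         # drop the dummy token again
    assert padded_tokens == [(3, 7, 0, 0), (5, 0, 0, 0)]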
Example #20
        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if tag is None:
                tag = u'NONE'

            tags.append(tag)

        return {
            index_name:
            [vocabulary.get_token_index(tag, self._namespace) for tag in tags]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


PosTagIndexer = TokenIndexer.register(u"pos_tag")(PosTagIndexer)
Example #21
        if any(text is None for text in texts):
            raise ConfigurationError(u'ELMoTokenCharactersIndexer needs a tokenizer '
                                     u'that retains text')
        return {index_name: [ELMoCharacterMapper.convert_word_to_char_ids(text) for text in texts]}

    #overrides
    def get_padding_lengths(self, token):
        # pylint: disable=unused-argument
        return {}

    #overrides
    def get_padding_token(self):
        return []

    @staticmethod
    def _default_value_for_padding():
        return [0] * ELMoCharacterMapper.max_word_length

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):
        # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                            default_value=self._default_value_for_padding))
                for key, val in list(tokens.items()))

ELMoTokenCharactersIndexer = TokenIndexer.register(u"elmo_characters")(ELMoTokenCharactersIndexer)
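A minimal sketch of the padding default above: each padded slot is itself a full-length list of character ids (ELMoCharacterMapper.max_word_length is 50 in AllenNLP); the encoded word is illustrative:

    from allennlp.common.util import pad_sequence_to_length

    max_word_length = 50
    encoded_words = [[259] * max_word_length]   # one already-encoded word
    padded = pad_sequence_to_length(encoded_words, 3,
                                    default_value=lambda: [0] * max_word_length)
    assert len(padded) == 3 and padded[1] == [0] * max_word_length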
Example #22
 def test_registry_has_builtin_token_indexers(self):
     assert TokenIndexer.by_name('single_id').__name__ == 'SingleIdTokenIndexer'
     assert TokenIndexer.by_name('characters').__name__ == 'TokenCharactersIndexer'
Example #23
        # If there's too few tokens, just pad with zeros.
        text_tokens.extend(0 for _ in range(self.n_ctx - num_tokens))

        return {
                index_name: text_tokens,
                "{index_name}-offsets": offsets,
                # add mask here according to the original tokens,
                # because calling util.get_text_field_mask on the
                # "byte pair" tokens will produce the wrong shape
                u"mask": [1 for _ in offsets]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in tokens.items())

OpenaiTransformerBytePairIndexer = TokenIndexer.register(u"openai_transformer_byte_pair")(OpenaiTransformerBytePairIndexer)
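A toy illustration of the offsets/mask bookkeeping above; all values are made up:

    text_tokens = [101, 102, 103, 104, 0, 0]    # byte-pair ids, zero-padded to n_ctx
    offsets = [1, 3]                            # last byte-pair position of each original token
    mask = [1 for _ in offsets]                 # one mask entry per original token, not per byte pair
    assert len(mask) == len(offsets) == 2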