                self._logged_errors.add(token.text)
            dep_label = u'NONE'
        counter[self.namespace][dep_label] += 1

    #overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        dep_labels = [token.dep_ or u'NONE' for token in tokens]

        return {
            index_name: [
                vocabulary.get_token_index(dep_label, self.namespace)
                for dep_label in dep_labels
            ]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


DepLabelIndexer = TokenIndexer.register(u"dependency_label")(DepLabelIndexer)
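
The snippet above tallies each dependency label into a namespaced counter and later maps labels to integers through a fitted vocabulary in tokens_to_indices. A minimal stdlib-only sketch of that flow (the Token stub, namespace name, and dict vocabulary are illustrative stand-ins, not the library's classes):

from collections import defaultdict, namedtuple

# Illustrative stand-in for a spaCy-style token with a dependency label.
Token = namedtuple("Token", ["text", "dep_"])

tokens = [Token(u"dogs", u"nsubj"), Token(u"bark", u"ROOT"), Token(u"loudly", u"")]

# Pass 1: count labels per namespace, falling back to 'NONE' for missing labels.
counter = defaultdict(lambda: defaultdict(int))
for token in tokens:
    counter[u"dep_labels"][token.dep_ or u"NONE"] += 1

# Pass 2: build a toy label -> index mapping and index the sequence.
vocab = {label: i for i, label in enumerate(sorted(counter[u"dep_labels"]))}
indices = [vocab[token.dep_ or u"NONE"] for token in tokens]
print(indices)  # [2, 1, 0] with this toy vocabulary
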
Example #2
        for token in tokens:
            if getattr(token, u'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {index_name: indices}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))

SingleIdTokenIndexer = TokenIndexer.register(u"single_id")(SingleIdTokenIndexer)
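
tokens_to_indices above takes two paths: a token with a precomputed text_id skips the vocabulary lookup entirely, otherwise the (optionally lowercased) text is looked up in the namespace. A toy stand-alone illustration, with a namedtuple stub and a plain dict in place of the real Token and Vocabulary:

from collections import namedtuple

# Illustrative stub; the real Token also carries POS/dep/NER fields.
Token = namedtuple("Token", ["text", "text_id"])

vocab = {u"the": 2, u"cat": 3}   # toy namespace: token text -> index
lowercase_tokens = True

tokens = [Token(u"The", None), Token(u"cat", None), Token(u"copy-token", 99)]

indices = []
for token in tokens:
    if token.text_id is not None:
        # A precomputed id bypasses the vocabulary entirely.
        indices.append(token.text_id)
    else:
        text = token.text.lower() if lowercase_tokens else token.text
        indices.append(vocab.get(text, 1))  # 1 as a toy OOV index
print(indices)  # [2, 3, 99]
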
Example #3
        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if tag is None:
                tag = u'NONE'

            tags.append(tag)

        return {
            index_name:
            [vocabulary.get_token_index(tag, self._namespace) for tag in tags]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


PosTagIndexer = TokenIndexer.register(u"pos_tag")(PosTagIndexer)
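
The tag choice above reflects spaCy's two POS attributes: pos_ carries the coarse universal tag and tag_ the fine-grained treebank tag, with u'NONE' as the fallback. A tiny illustration using a stub token rather than spaCy's:

from collections import namedtuple

# Stub carrying both spaCy-style POS fields.
Token = namedtuple("Token", ["text", "pos_", "tag_"])
token = Token(u"runs", u"VERB", u"VBZ")

coarse_tags = True
tag = (token.pos_ if coarse_tags else token.tag_) or u"NONE"
print(tag)  # 'VERB' when coarse, 'VBZ' otherwise
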
Example #4
        if any(text is None for text in texts):
            raise ConfigurationError(u'ELMoTokenCharactersIndexer needs a tokenizer '
                                     u'that retains text')
        return {index_name: [ELMoCharacterMapper.convert_word_to_char_ids(text) for text in texts]}

    #overrides
    def get_padding_lengths(self, token):
        # pylint: disable=unused-argument
        return {}

    #overrides
    def get_padding_token(self):
        return []

    @staticmethod
    def _default_value_for_padding():
        return [0] * ELMoCharacterMapper.max_word_length

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):
        # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                            default_value=self._default_value_for_padding))
                for key, val in list(tokens.items()))

ELMoTokenCharactersIndexer = TokenIndexer.register(u"elmo_characters")(ELMoTokenCharactersIndexer)
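
Here padding works per word rather than per id: get_padding_token returns an empty list and _default_value_for_padding supplies an all-zero vector of the mapper's fixed word length, so a padded position is a full-width character-id row. A rough stand-alone sketch of that behaviour (the width constant and toy ids are illustrative, not ELMo's actual values):

# Each token is a fixed-width row of character ids; a padded position is all zeros.
MAX_WORD_LENGTH = 50  # illustrative width; the real character mapper defines its own

def default_padding_token():
    return [0] * MAX_WORD_LENGTH

def pad_char_id_rows(rows, desired_num_tokens):
    padded = list(rows[:desired_num_tokens])
    while len(padded) < desired_num_tokens:
        padded.append(default_padding_token())
    return padded

rows = [[2, 15, 8, 3] + [0] * 46]            # one token, already width 50
padded = pad_char_id_rows(rows, 3)
print(len(padded), len(padded[-1]))          # 3 50
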
Example #5
        if not tag:
            tag = u'NONE'
        counter[self._namespace][tag] += 1

    #overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        tags = [u'NONE' if token.ent_type_ is None else token.ent_type_ for token in tokens]

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))

NerTagIndexer = TokenIndexer.register(u"ner_tag")(NerTagIndexer)
Example #6
        padded_tokens = pad_sequence_to_length(
            tokens[key],
            desired_num_tokens[key],
            default_value=self.get_padding_token)

        # Pad the characters within the tokens.
        desired_token_length = padding_lengths[u'num_token_characters']
        longest_token = max(tokens[key], key=len, default=[])
        padding_value = 0
        if desired_token_length > len(longest_token):
            # Since we want to pad to greater than the longest token, we add a
            # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
            padded_tokens.append([padding_value] * desired_token_length)
        # pad the list of lists to the longest sublist, appending 0's
        padded_tokens = list(
            zip(*itertools.zip_longest(*padded_tokens,
                                       fillvalue=padding_value)))
        if desired_token_length > len(longest_token):
            # Removes the "dummy token".
            padded_tokens.pop()
        # Truncate all the tokens to the desired length and return the result.
        return {
            key:
            [list(token[:desired_token_length]) for token in padded_tokens]
        }


TokenCharactersIndexer = TokenIndexer.register(u"characters")(
    TokenCharactersIndexer)
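
The dummy-token trick above widens a ragged list of character-id rows: appending one row of the desired width makes itertools.zip_longest pad every row at least that far, the dummy row is then dropped, and all rows are truncated to the target width. The same idea as a small stand-alone helper:

import itertools

def pad_rows_to_width(rows, desired_width, padding_value=0):
    rows = [list(row) for row in rows]
    longest = max((len(row) for row in rows), default=0)
    added_dummy = desired_width > longest
    if added_dummy:
        # Dummy row so zip_longest pads every row out to desired_width.
        rows.append([padding_value] * desired_width)
    padded = list(zip(*itertools.zip_longest(*rows, fillvalue=padding_value)))
    if added_dummy:
        padded.pop()  # drop the dummy row again
    return [list(row[:desired_width]) for row in padded]

print(pad_rows_to_width([[5, 6], [7]], 4))  # [[5, 6, 0, 0], [7, 0, 0, 0]]
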
Example #7
        # If there are too few tokens, just pad with zeros.
        text_tokens.extend(0 for _ in range(self.n_ctx - num_tokens))

        return {
                index_name: text_tokens,
                "{index_name}-offsets": offsets,
                # add mask here according to the original tokens,
                # because calling util.get_text_field_mask on the
                # "byte pair" tokens will produce the wrong shape
                u"mask": [1 for _ in offsets]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in tokens.items())

OpenaiTransformerBytePairIndexer = TokenIndexer.register(u"openai_transformer_byte_pair")(OpenaiTransformerBytePairIndexer)
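
The mask above is built from offsets rather than from the byte-pair ids because one original token can expand into several byte pairs, and offsets keeps one position per original token. A toy walk-through of that bookkeeping (the ids and the last-piece offset convention are illustrative):

# Toy byte-pair expansion: each original token becomes one or more BPE ids.
bpe_ids_per_token = [[40], [318, 509], [27]]   # 3 original tokens -> 4 byte pairs

text_tokens = []
offsets = []
for pieces in bpe_ids_per_token:
    text_tokens.extend(pieces)
    offsets.append(len(text_tokens) - 1)       # position of each token's last piece

mask = [1 for _ in offsets]                    # one mask entry per ORIGINAL token
print(text_tokens, offsets, mask)              # [40, 318, 509, 27] [0, 2, 3] [1, 1, 1]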