                self._logged_errors.add(token.text)
            dep_label = u'NONE'
        counter[self.namespace][dep_label] += 1

    #overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        dep_labels = [token.dep_ or u'NONE' for token in tokens]
        return {index_name: [vocabulary.get_token_index(dep_label, self.namespace)
                             for dep_label in dep_labels]}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


DepLabelIndexer = TokenIndexer.register(u"dependency_label")(DepLabelIndexer)
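# Hedged usage sketch (not part of the module): a tag-style indexer like DepLabelIndexer
# maps one linguistic annotation per token to a vocabulary index. The `Vocabulary` and
# `Token` usage, the `namespace` keyword, and the "dep_labels" namespace below are
# illustrative assumptions; tokens are assumed to have their `dep_` attribute filled in
# by a parser (e.g. a spaCy pipeline with parsing enabled).
#
#   vocab = Vocabulary()
#   vocab.add_token_to_namespace(u"nsubj", namespace=u"dep_labels")
#   indexer = DepLabelIndexer(namespace=u"dep_labels")
#   indexer.tokens_to_indices(parsed_tokens, vocab, u"dep_labels")
#   # -> {u"dep_labels": [index of each token's dep_ label, or of u"NONE"]}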
        for token in tokens:
            if getattr(token, u'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {index_name: indices}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


SingleIdTokenIndexer = TokenIndexer.register(u"single_id")(SingleIdTokenIndexer)
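# Hedged usage sketch (not part of the module): SingleIdTokenIndexer looks each token's
# text up in a single vocabulary namespace. The `Vocabulary` and `Token` calls below are
# illustrative of the surrounding library's API and should be treated as assumptions.
#
#   vocab = Vocabulary()
#   hello_id = vocab.add_token_to_namespace(u"hello", namespace=u"tokens")
#   indexer = SingleIdTokenIndexer(namespace=u"tokens", lowercase_tokens=True)
#   indexer.tokens_to_indices([Token(u"Hello")], vocab, u"tokens")
#   # -> {u"tokens": [hello_id]}   (lowercasing makes "Hello" match "hello")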
        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if tag is None:
                tag = u'NONE'
            tags.append(tag)

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


PosTagIndexer = TokenIndexer.register(u"pos_tag")(PosTagIndexer)
        if any(text is None for text in texts):
            raise ConfigurationError(u'ELMoTokenCharactersIndexer needs a tokenizer '
                                     u'that retains text')
        return {index_name: [ELMoCharacterMapper.convert_word_to_char_ids(text) for text in texts]}

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def get_padding_token(self):
        return []

    @staticmethod
    def _default_value_for_padding():
        return [0] * ELMoCharacterMapper.max_word_length

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                                 default_value=self._default_value_for_padding))
                    for key, val in list(tokens.items()))


ELMoTokenCharactersIndexer = TokenIndexer.register(u"elmo_characters")(ELMoTokenCharactersIndexer)
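# Hedged usage sketch (not part of the module): ELMoTokenCharactersIndexer ignores the
# vocabulary entirely; every token is mapped directly to a fixed-length list of character
# ids. The `Vocabulary` and `Token` names below are illustrative assumptions.
#
#   indexer = ELMoTokenCharactersIndexer()
#   indexer.tokens_to_indices([Token(u"snails")], Vocabulary(), u"elmo")
#   # -> {u"elmo": [[...]]}  where the inner list holds
#   #    ELMoCharacterMapper.max_word_length character ids for the single token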
        if not tag:
            tag = u'NONE'
        counter[self._namespace][tag] += 1

    #overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        tags = [u'NONE' if token.ent_type_ is None else token.ent_type_ for token in tokens]
        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


NerTagIndexer = TokenIndexer.register(u"ner_tag")(NerTagIndexer)
        padded_tokens = pad_sequence_to_length(tokens[key],
                                               desired_num_tokens[key],
                                               default_value=self.get_padding_token)

        # Pad the characters within the tokens.
        desired_token_length = padding_lengths[u'num_token_characters']
        longest_token = max(tokens[key], key=len, default=[])
        padding_value = 0
        if desired_token_length > len(longest_token):
            # Since we want to pad to greater than the longest token, we add a
            # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
            padded_tokens.append([padding_value] * desired_token_length)
        # Pad the list of lists to the longest sublist, appending 0's.
        padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
        if desired_token_length > len(longest_token):
            # Remove the "dummy token".
            padded_tokens.pop()
        # Truncate all the tokens to the desired length, and return the result.
        return {key: [list(token[:desired_token_length]) for token in padded_tokens]}


TokenCharactersIndexer = TokenIndexer.register(u"characters")(TokenCharactersIndexer)
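# Hedged worked example (not part of the module) of the two-stage padding above, with
# illustrative inputs: the token list is first padded out to `desired_num_tokens`, then
# every per-token character list is padded (or truncated) to `num_token_characters`.
#
#   indexer.pad_token_sequence({u"token_characters": [[1, 2, 3], [4]]},
#                              desired_num_tokens={u"token_characters": 3},
#                              padding_lengths={u"num_token_characters": 5})
#   # -> {u"token_characters": [[1, 2, 3, 0, 0], [4, 0, 0, 0, 0], [0, 0, 0, 0, 0]]}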
        # If there are too few tokens, just pad with zeros.
        text_tokens.extend(0 for _ in range(self.n_ctx - num_tokens))

        return {
                index_name: text_tokens,
                u"{}-offsets".format(index_name): offsets,
                # Add the mask here according to the original tokens, because calling
                # util.get_text_field_mask on the "byte pair" tokens will produce the
                # wrong shape.
                u"mask": [1 for _ in offsets]
        }

    #overrides
    def get_padding_token(self):
        return 0

    #overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    #overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in tokens.items())


OpenaiTransformerBytePairIndexer = TokenIndexer.register(u"openai_transformer_byte_pair")(OpenaiTransformerBytePairIndexer)
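# Hedged shape sketch (not part of the module) of the dictionary returned above, to make
# the masking comment concrete. For an illustrative 2-token text that byte-pair encodes
# into 5 pieces, the result looks like:
#
#   {
#       index_name: [...],                   # 5 byte-pair ids, zero-padded to n_ctx
#       index_name + u"-offsets": [o1, o2],  # one offset into the byte-pair sequence
#                                            # per original token
#       u"mask": [1, 1],                     # one entry per original token, NOT per
#                                            # byte pair
#   }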