def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             columns_header: List = COLUMNS_HEADER,
             use_header: str = "ner",
             ignore_tag: str = None,
             input_scheme: str = 'IOB1',
             tag_scheme: str = 'IOB2',
             field_sep: str = None,
             encoding: str = "latin-1",
             lm_task: bool = False,
             start_end: bool = False,
             max_characters_per_token: int = 50,
             lazy: bool = True) -> None:
    super(CustomConll, self).__init__(lazy=lazy)
    # ``TokenIndexer`` is abstract, so fall back to a concrete single-id indexer.
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._columns_header = columns_header
    self._use_header = use_header
    self._ignore_tag = ignore_tag
    self._input_scheme = input_scheme
    self._tag_scheme = tag_scheme
    self._field_sep = field_sep
    self._encoding = encoding
    self._lm_task = lm_task
    self._start_end = start_end
    self._max_characters_per_token = max_characters_per_token
@classmethod
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
@classmethod
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    tokens_per_instance = params.pop_int('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers,
                                  lazy=lazy)
@classmethod
def from_params(cls, params: Params) -> 'SnliReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
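# A minimal sketch of how a ``from_params`` method like the one above is
# driven by a JSON-style config. ``allennlp.common.Params`` is the real
# class, but the config keys shown are illustrative, not a complete SNLI
# setup.
from allennlp.common import Params

params = Params({
    u"token_indexers": {
        u"tokens": {u"type": u"single_id", u"lowercase_tokens": True},
    },
})
# reader = SnliReader.from_params(params)  # would build one indexer named "tokens"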
def __init__(self,
             model_name: str,
             namespace: str = "tags",
             max_length: int = None,
             **kwargs) -> None:
    TokenIndexer.__init__(self, **kwargs)
    self._namespace = namespace
    self._allennlp_tokenizer = PretrainedChineseBertTokenizer(model_name)
    self._tokenizer = self._allennlp_tokenizer.tokenizer
    self._added_to_vocabulary = False
    self._num_added_start_tokens = len(self._allennlp_tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(self._allennlp_tokenizer.single_sequence_end_tokens)
    self._max_length = max_length
    if self._max_length is not None:
        # Tokenizing a single word and subtracting it out counts the special
        # tokens the tokenizer inserts around a sequence.
        num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
        # We need to take the special tokens into account.
        self._effective_max_length = self._max_length - num_added_tokens
        if self._effective_max_length <= 0:
            raise ValueError(
                "max_length needs to be greater than the number of special tokens inserted."
            )
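# A sketch of the effective-max-length arithmetic above, with hypothetical
# numbers (no tokenizer needed): tokenizing the single word "a" might yield
# [CLS] a [SEP], i.e. three pieces, of which two are special tokens.
num_pieces_for_one_word = 3                       # hypothetical tokenizer output
num_added_tokens = num_pieces_for_one_word - 1    # special tokens only
max_length = 512                                  # hypothetical cap
effective_max_length = max_length - num_added_tokens
assert effective_max_length == 510                # room left for real pieces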
@classmethod
def from_params(cls, params: Params) -> 'SrlReader':
    """
    Parameters
    ----------
    token_indexers : ``Dict[str, Params]``, optional
    """
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SrlReader(token_indexers=token_indexers)
@classmethod
def from_params(cls, params: Params) -> 'SquadReader':
    """
    Parameters
    ----------
    tokenizer : ``Params``, optional (default=``{}``)
    token_indexers : ``Params``, optional (default=``{}``)
    """
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', {})
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class, so if no
    # parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers)
@classmethod
def from_params(cls, params: Params) -> 'TargzReaders':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    vocab_file = params.pop('vocab_file')
    mentions_tarfile = params.pop('mentions_tarfile')
    compression_mode = params.pop('compression_mode', 'gz')
    encoding = params.pop('encoding', 'utf-8')
    start_end = params.pop('start_end', False)
    label_map = params.pop('label_map', LABEL_MAP)
    lm_task = params.pop('lm_task', False)
    params.assert_empty(cls.__name__)
    return TargzReaders(token_indexers=token_indexers,
                        vocab_file=vocab_file,
                        mentions_tarfile=mentions_tarfile,
                        compression_mode=compression_mode,
                        label_map=label_map,
                        encoding=encoding,
                        lm_task=lm_task,
                        start_end=start_end)
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             vocab_file: str = None,
             mentions_tarfile: str = None,
             compression_mode: str = 'gz',
             label_map: List[str] = None,
             encoding: str = "utf-8",
             lm_task: bool = False,
             start_end: bool = False,
             lazy: bool = True) -> None:
    super(TargzReaders, self).__init__(lazy=lazy)
    # As above, ``TokenIndexer`` is abstract; default to a single-id indexer.
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._mentions_tarfile = mentions_tarfile
    self._compression_mode = compression_mode
    self._vocab_file = vocab_file
    self._encoding = encoding
    self._lm_task = lm_task
    self._start_end = start_end
    self._label_map = label_map
    self.vocab = []
    self.load_vocab()
@classmethod
def from_params(cls, params: Params) -> 'LanguageModelingReader':
    """
    Parameters
    ----------
    tokens_per_instance : ``int``, optional (default=``None``)
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
    """
    tokens_per_instance = params.pop('tokens_per_instance', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return LanguageModelingReader(tokens_per_instance=tokens_per_instance,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
@classmethod
def from_params(cls, params: Params) -> 'SequenceTaggingDatasetReader':
    """
    Parameters
    ----------
    token_indexers : ``Dict[str, Params]``, optional
    """
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    word_tag_delimiter = params.pop("word_tag_delimiter", DEFAULT_WORD_TAG_DELIMITER)
    token_delimiter = params.pop("token_delimiter", None)
    params.assert_empty(cls.__name__)
    return SequenceTaggingDatasetReader(token_indexers=token_indexers,
                                        word_tag_delimiter=word_tag_delimiter,
                                        token_delimiter=token_delimiter)
@classmethod
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    """
    Parameters
    ----------
    negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
    """
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(negative_sentence_selection=negative_sentence_selection,
                                        tokenizer=tokenizer,
                                        token_indexers=token_indexers)
@classmethod
def from_params(cls, params: Params) -> 'CustomConll':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    columns_header = params.pop('columns_header', COLUMNS_HEADER)
    use_header = params.pop('use_header', 'ner')
    encoding = params.pop('encoding', 'latin-1')
    ignore_tag = params.pop('ignore_tag', None)
    input_scheme = params.pop('input_scheme', 'IOB1')
    tag_scheme = params.pop('tag_scheme', 'IOB2')
    field_sep = params.pop('field_sep', None)
    lm_task = params.pop('lm_task', False)
    max_characters_per_token = params.pop('max_characters_per_token', 50)
    start_end = params.pop('start_end', False)
    params.assert_empty(cls.__name__)
    return CustomConll(token_indexers=token_indexers,
                       columns_header=columns_header,
                       use_header=use_header,
                       ignore_tag=ignore_tag,
                       input_scheme=input_scheme,
                       tag_scheme=tag_scheme,
                       field_sep=field_sep,
                       encoding=encoding,
                       lm_task=lm_task,
                       max_characters_per_token=max_characters_per_token,
                       start_end=start_end)
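# Sketch: ``params.assert_empty`` is what catches config typos in the method
# above. A misspelled key survives all the ``pop`` calls and triggers an
# error (assuming allennlp.common.Params semantics; the typo is deliberate).
from allennlp.common import Params

params = Params({u"encodng": u"utf-8"})   # note the typo: "encodng"
params.pop(u"encoding", u"latin-1")       # falls back to the default
try:
    params.assert_empty(u"CustomConll")
except Exception as err:                  # ConfigurationError in AllenNLP
    print(err)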
        if not tag:
            tag = u'NONE'
        counter[self._namespace][tag] += 1

    @overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        # spaCy uses the empty string, not None, for tokens without an entity
        # type, so test truthiness (matching count_vocab_items above).
        tags = [token.ent_type_ if token.ent_type_ else u'NONE' for token in tokens]
        return {index_name: [vocabulary.get_token_index(tag, self._namespace)
                             for tag in tags]}

    @overrides
    def get_padding_token(self):
        return 0

    @overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    @overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


NerTagIndexer = TokenIndexer.register(u"ner_tag")(NerTagIndexer)
def test_registry_has_builtin_token_indexers(self):
    assert TokenIndexer.by_name("single_id").__name__ == "SingleIdTokenIndexer"
    assert TokenIndexer.by_name("characters").__name__ == "TokenCharactersIndexer"
        for token in tokens:
            if getattr(token, u'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using
                # the vocab, we just use this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))
        return {index_name: indices}

    @overrides
    def get_padding_token(self):
        return 0

    @overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    @overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


SingleIdTokenIndexer = TokenIndexer.register(u"single_id")(SingleIdTokenIndexer)
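# Usage sketch for the indexer above, matching the 0.x-era
# ``tokens_to_indices(tokens, vocabulary, index_name)`` signature shown
# in this file; the import paths assume AllenNLP 0.x.
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
hello_id = vocab.add_token_to_namespace(u"hello", namespace=u"tokens")
indexer = SingleIdTokenIndexer(namespace=u"tokens", lowercase_tokens=True)
# lowercase_tokens=True folds "Hello" to "hello" before the vocab lookup.
indices = indexer.tokens_to_indices([Token(u"Hello")], vocab, u"tokens")
assert indices == {u"tokens": [hello_id]}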
                self._logged_errors.add(token.text)
            dep_label = u'NONE'
        counter[self.namespace][dep_label] += 1

    @overrides
    def tokens_to_indices(self, tokens, vocabulary, index_name):
        dep_labels = [token.dep_ or u'NONE' for token in tokens]
        return {index_name: [vocabulary.get_token_index(dep_label, self.namespace)
                             for dep_label in dep_labels]}

    @overrides
    def get_padding_token(self):
        return 0

    @overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    @overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


DepLabelIndexer = TokenIndexer.register(u"dependency_label")(DepLabelIndexer)
        padded_tokens = pad_sequence_to_length(tokens[key],
                                               desired_num_tokens[key],
                                               default_value=self.get_padding_token)

        # Pad the characters within the tokens.
        desired_token_length = padding_lengths[u'num_token_characters']
        longest_token = max(tokens[key], key=len, default=[])
        padding_value = 0
        if desired_token_length > len(longest_token):
            # Since we want to pad to greater than the longest token, we add a
            # "dummy token" so we can take advantage of the fast implementation
            # of itertools.zip_longest.
            padded_tokens.append([padding_value] * desired_token_length)
        # Pad the list of lists to the longest sublist, appending 0's.
        # (``zip``, not the Python 2 ``izip``, transposes the rows back.)
        padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens,
                                                        fillvalue=padding_value)))
        if desired_token_length > len(longest_token):
            # Removes the "dummy token".
            padded_tokens.pop()
        # Truncate all the tokens to the desired length, and return the result.
        return {key: [list(token[:desired_token_length]) for token in padded_tokens]}


TokenCharactersIndexer = TokenIndexer.register(u"characters")(TokenCharactersIndexer)
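# Standalone sketch of the dummy-token trick in ``pad_token_sequence`` above:
# appending one all-padding row makes ``zip_longest`` pad every token out to
# the desired length; transposing twice restores the original row order.
import itertools

padded_tokens = [[3, 1], [7, 7, 7]]          # char ids for two tokens
desired_token_length = 5                      # longer than the longest token
padding_value = 0
padded_tokens.append([padding_value] * desired_token_length)   # dummy token
padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens,
                                                fillvalue=padding_value)))
padded_tokens.pop()                           # drop the dummy token
assert [list(t) for t in padded_tokens] == [[3, 1, 0, 0, 0], [7, 7, 7, 0, 0]]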
        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            # spaCy uses the empty string for missing tags, so test truthiness
            # rather than ``is None``.
            if not tag:
                tag = u'NONE'
            tags.append(tag)
        return {index_name: [vocabulary.get_token_index(tag, self._namespace)
                             for tag in tags]}

    @overrides
    def get_padding_token(self):
        return 0

    @overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    @overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in list(tokens.items()))


PosTagIndexer = TokenIndexer.register(u"pos_tag")(PosTagIndexer)
        if any(text is None for text in texts):
            raise ConfigurationError(u'ELMoTokenCharactersIndexer needs a tokenizer '
                                     u'that retains text')
        return {index_name: [ELMoCharacterMapper.convert_word_to_char_ids(text)
                             for text in texts]}

    @overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    @overrides
    def get_padding_token(self):
        return []

    @staticmethod
    def _default_value_for_padding():
        return [0] * ELMoCharacterMapper.max_word_length

    @overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                                 default_value=self._default_value_for_padding))
                    for key, val in list(tokens.items()))


ELMoTokenCharactersIndexer = TokenIndexer.register(u"elmo_characters")(ELMoTokenCharactersIndexer)
        # If there are too few tokens, just pad with zeros.
        text_tokens.extend(0 for _ in range(self.n_ctx - num_tokens))

        return {
            index_name: text_tokens,
            u"{}-offsets".format(index_name): offsets,
            # Add the mask here according to the original tokens, because
            # calling util.get_text_field_mask on the "byte pair" tokens
            # will produce the wrong shape.
            u"mask": [1 for _ in offsets],
        }

    @overrides
    def get_padding_token(self):
        return 0

    @overrides
    def get_padding_lengths(self, token):  # pylint: disable=unused-argument
        return {}

    @overrides
    def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                    for key, val in tokens.items())


OpenaiTransformerBytePairIndexer = TokenIndexer.register(u"openai_transformer_byte_pair")(OpenaiTransformerBytePairIndexer)
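# Sketch of the masking comment above: the mask is sized by ``offsets``
# (one entry per *original* token), not by the byte-pair ids. All values
# here are hypothetical.
text_tokens = [38, 912, 204, 77]   # four byte-pair ids
offsets = [0, 2, 3]                # last byte pair of each of 3 tokens
mask = [1 for _ in offsets]
assert len(mask) == 3              # per original token, not per byte pair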