def test_handles_byte_encoding(self):
    tokenizer = CharacterTokenizer(byte_encoding='utf-8', start_tokens=[259], end_tokens=[260])
    word = "åøâáabe"
    tokens = [t.text_id for t in tokenizer.tokenize(word)]
    # Note that we've added one to the utf-8 encoded bytes, to account for masking.
    expected_tokens = [259, 196, 166, 196, 185, 196, 163, 196, 162, 98, 99, 102, 260]
    assert tokens == expected_tokens
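As a side note on the expected ids above: every value between the start token (259) and end token (260) is the corresponding UTF-8 byte shifted up by one, so index 0 stays free for padding/masking. A tiny standalone check (plain Python, independent of the test):

# Standalone sketch of the byte-plus-one scheme used in the test above.
word = "åøâáabe"
shifted = [b + 1 for b in word.encode("utf-8")]
assert shifted == [196, 166, 196, 185, 196, 163, 196, 162, 98, 99, 102]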
def test_splits_into_characters(self):
    tokenizer = CharacterTokenizer(start_tokens=['<S1>', '<S2>'], end_tokens=['</S2>', '</S1>'])
    sentence = "A, small sentence."
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    expected_tokens = ["<S1>", "<S2>", "A", ",", " ", "s", "m", "a", "l", "l", " ",
                       "s", "e", "n", "t", "e", "n", "c", "e", ".", '</S2>', '</S1>']
    assert tokens == expected_tokens
def test_non_word_ending_suffix(self):
    tokenizer = CharacterTokenizer(non_word_end_suffix="##")
    sent = "A, small sentence."
    tokens = [t.text for t in tokenizer.tokenize(sent)]
    expected_tokens = [
        "A##", ",", "s##", "m##", "a##", "l##", "l",
        "s##", "e##", "n##", "t##", "e##", "n##", "c##", "e##", "."
    ]
    assert tokens == expected_tokens
def test_splits_into_characters(self):
    tokenizer = CharacterTokenizer(start_tokens=[u'<S1>', u'<S2>'], end_tokens=[u'</S2>', u'</S1>'])
    sentence = u"A, small sentence."
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    expected_tokens = [
        u"<S1>", u"<S2>", u"A", u",", u" ", u"s", u"m", u"a", u"l", u"l", u" ",
        u"s", u"e", u"n", u"t", u"e", u"n", u"c", u"e", u".", u'</S2>', u'</S1>'
    ]
    assert tokens == expected_tokens
def test_to_params(self):
    tokenizer = CharacterTokenizer(byte_encoding="utf-8", start_tokens=[259], end_tokens=[260])
    params = tokenizer.to_params()
    assert isinstance(params, Params)
    assert params.params == {
        "type": "character",
        "byte_encoding": "utf-8",
        "end_tokens": [260],
        "start_tokens": [259],
        "lowercase_characters": False,
    }
def read_dataset(all_chars: Set[str] = None) -> List[List[Token]]:
    tokenizer = CharacterTokenizer()
    sentences = []
    with open('data/mt/sentences.eng.10k.txt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = re.sub(' +', ' ', line)
            tokens = tokenizer.tokenize(line)
            if all_chars:
                tokens = [token for token in tokens if token.text in all_chars]
            sentences.append(tokens)
    return sentences
def __init__(
    self,
    bpps_dir: Optional[str] = None,
    max_sequence_length: Optional[int] = None,
    add_start_end_tokens: bool = True,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._bpps_dir = bpps_dir if bpps_dir is None else Path(bpps_dir)
    self._max_sequence_length = max_sequence_length or math.inf
    self._add_start_end_tokens = add_start_end_tokens
    self._tokenizer = CharacterTokenizer()
    self._start_token = Token(START_TOKEN)
    self._end_token = Token(END_TOKEN)
def __init__(self, model_path, top_k=3, cuda_device=-1):
    archive = load_archive(model_path, cuda_device=cuda_device)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()
    self.model = model
    self._tokenizer = CharacterTokenizer()
    self._token_indexers = {'tokens': SingleIdTokenIndexer()}
    self._id_to_label = model.vocab.get_index_to_token_vocabulary(namespace='labels')
    self._top_k = top_k
class SentenceSegmentPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._character_tokenizer = CharacterTokenizer()

    def predict(self, sentence: str) -> JsonDict:
        results = self.predict_json({"sentence": sentence})
        return results

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict['sentence']
        character_tokens = self._character_tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance(character_tokens)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance = self._json_to_instance(inputs)
        result = self.predict_instance(instance)
        sentence = inputs['sentence']
        tags = result['tags']
        # Rebuild word segments from per-character tags: 'S' (single) and 'B' (begin)
        # start a new segment; any other tag appends to the current segment.
        segment: List = []
        seg_idx = -1
        for idx, word in enumerate(sentence):
            if tags[idx] == 'S' or tags[idx] == 'B':
                segment.append(word)
                seg_idx += 1
            else:
                segment[seg_idx] += word
        result['segment'] = segment
        return result
def __init__(self,
             cache_path: Union[str, Path],
             audio_transformer: AudioTransformer = None,
             feature_name: str = 'mag',
             force: bool = False,
             lazy: bool = False) -> None:
    super(VCTK, self).__init__(lazy)
    self.cache_path = Path(cache_path)
    self.audio_transformer = audio_transformer or MelSpectrogram()
    self.feature_name = feature_name
    self.tokenizer = CharacterTokenizer()
    self.token_indexers = {'character': SingleIdTokenIndexer(namespace='character')}
    if force and self.cache_path.exists():
        self.cache_path.unlink()
def __init__(
    self,
    namespace: str = "token_characters",
    character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    min_padding_length: int = 0,
    token_min_padding_length: int = 0,
) -> None:
    super().__init__(token_min_padding_length)
    if min_padding_length == 0:
        url = "https://github.com/allenai/allennlp/issues/1954"
        warnings.warn(
            "You are using the default value (0) of `min_padding_length`, "
            f"which can cause some subtle bugs (more info see {url}). "
            "Strongly recommend to set a value, usually the maximum size "
            "of the convolutional layer size when using CnnEncoder.",
            UserWarning,
        )
    self._min_padding_length = min_padding_length
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexer: Dict[str, TokenIndexer] = None,
             target_token_indexer: Dict[str, TokenIndexer] = None,
             max_tokens: int = None,
             target_add_start_token: bool = True,
             target_add_end_token: bool = True,
             end_symbol: str = END_SYMBOL,
             start_symbol: str = START_SYMBOL,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._source_tokenizer = source_tokenizer or CharacterTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL],
        lowercase_characters=True)
    self._target_tokenizer = target_tokenizer or WhitespaceTokenizer()
    self._source_token_indexer = source_token_indexer or {'tokens': SingleIdTokenIndexer()}
    self._target_token_indexer = target_token_indexer or {'tokens': SingleIdTokenIndexer()}
    self._max_tokens = max_tokens
    self._start_symbol = start_symbol
    self._end_symbol = end_symbol
    self._target_add_start_token = target_add_start_token
    self._target_add_end_token = target_add_end_token
def test_batch_tokenization(self):
    tokenizer = CharacterTokenizer()
    sentences = [
        "This is a sentence",
        "This isn't a sentence.",
        "This is the 3rd sentence.",
        "Here's the 'fourth' sentence.",
    ]
    batch_tokenized = tokenizer.batch_tokenize(sentences)
    separately_tokenized = [tokenizer.tokenize(sentence) for sentence in sentences]
    assert len(batch_tokenized) == len(separately_tokenized)
    for batch_sentence, separate_sentence in zip(batch_tokenized, separately_tokenized):
        assert len(batch_sentence) == len(separate_sentence)
        for batch_word, separate_word in zip(batch_sentence, separate_sentence):
            assert batch_word.text == separate_word.text
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or CharacterTokenizer()
    self._token_indexers = token_indexers or {'tokens': TokenCharactersIndexer()}
def __init__(self,
             tokenizer=None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer = tokenizer or CharacterTokenizer()
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_token_len: int = 512):
    super().__init__(lazy)
    self._tokenizer = tokenizer or CharacterTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._max_token_len = max_token_len
def __init__(self, lazy: bool = False, tokenizer: Tokenizer = None, token_indexers: Dict[str, TokenIndexer] = None) -> None: super().__init__(lazy) self._tokenizer = tokenizer or CharacterTokenizer() self._token_indexers = token_indexers or { "tokens": SingleIdTokenIndexer() } relation_vocab_path = './raw_data/chinese/relation_vocab.json' self.relation_vocab = json.load(open(relation_vocab_path, 'r'))
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    max_tokens: int = None,
    **kwargs,
):
    super().__init__(**kwargs)
    self.tokenizer = tokenizer or CharacterTokenizer()
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.max_tokens = max_tokens
def __init__(self,
             lazy: bool = False,
             max_sequence_length: int = 256,
             tokenizer: Tokenizer = None,
             token_indexer: TokenIndexer = None):
    super().__init__(lazy)
    self._character_tokenizer = tokenizer or CharacterTokenizer()
    self._token_indexer = token_indexer or {'tokens': SingleIdTokenIndexer()}
    self._max_sequence_length = max_sequence_length
    self._tags = ['B', 'M', 'E', 'S']
def construct_reader(is_pretrain):
    character_tokenizer = CharacterTokenizer(byte_encoding="utf-8",
                                             start_tokens=[259],
                                             end_tokens=[260])
    token_character_indexer = TokenCharactersIndexer(character_tokenizer=character_tokenizer,
                                                     min_padding_length=5)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = FollowUpDataReader(token_indexer={"token_words": token_indexer},
                                char_indexer={"token_characters": token_character_indexer},
                                is_pretrain=is_pretrain)
    return reader
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self._tok = tokenizer or CharacterTokenizer()
    # Guard against the default of None before checking for a BERT indexer.
    if token_indexers and 'bert' in token_indexers:
        # Wrap the wordpiece tokenizer so whitespace-only strings map to [UNK]
        # instead of producing an empty wordpiece list.
        wordpiece_tok = token_indexers['bert'].wordpiece_tokenizer
        token_indexers['bert'].wordpiece_tokenizer = \
            lambda s: ['[UNK]'] if s.isspace() else wordpiece_tok(s)
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer],
    tokenizer: Tokenizer = CharacterTokenizer(),
    max_sequence_length: Optional[int] = None,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self.token_indexers = token_indexers
    self.tokenizer = tokenizer
    self._max_sequence_length = max_sequence_length or math.inf
    logger.info("Creating LevenshteinReader")
    logger.info("max_sequence_length=%s", max_sequence_length)
class NamesDatasetReader(DatasetReader):
    def __init__(self, lazy: bool = False) -> None:
        super().__init__(lazy)
        self._c_tokenizer = CharacterTokenizer()
        self._c_token_indexers = {"tokens": SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, language: str, name: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_name = self._c_tokenizer.tokenize(name)
        name_field = TextField(tokenized_name, self._c_token_indexers)
        fields = {'name': name_field}
        if language is not None:
            print('language', language)
            fields['label'] = LabelField(language)
        return Instance(fields)

    @overrides
    def _read(self, file_path: str = './data/names/*.txt'):
        all_filenames = glob.glob('./data/names/*.txt')
        category_lines = {}
        all_categories = []
        for filename in all_filenames:
            category = filename.split('/')[-1].split('.')[0]
            all_categories.append(category)
            lines = readLines(filename)
            category_lines[category] = lines

        def random_training_pair():
            category = random.choice(all_categories)
            line = random.choice(category_lines[category])
            return category, line

        length_dict = sum(len(value) for key, value in category_lines.items())
        for i in range(len(category_lines)):
            language, name = random_training_pair()
            yield self.text_to_instance(language, name)
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             padding_length: int = None,
             char_level: bool = False) -> None:
    super().__init__(lazy=False)
    self._padding_length = padding_length
    self._tokenizer = tokenizer or (CharacterTokenizer() if char_level else WordTokenizer())
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(token_min_padding_length=(self._padding_length or 0))
    }
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_tokens: int = None,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or CharacterTokenizer(start_tokens=[START_SYMBOL],
                                                      end_tokens=[END_SYMBOL])
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._max_tokens = max_tokens
def from_params(cls, params: Params) -> 'TokenCharactersIndexer':
    """
    Parameters
    ----------
    namespace : ``str``, optional (default=``token_characters``)
        We will use this namespace in the :class:`Vocabulary` to map the characters in each
        token to indices.
    character_tokenizer : ``Params``, optional (default=``Params({})``)
        We use a :class:`CharacterTokenizer` to handle splitting tokens into characters, as it
        has options for byte encoding and other things.  These parameters get passed to the
        character tokenizer.  The default is to use unicode characters and to retain casing.
    """
    namespace = params.pop('namespace', 'token_characters')
    character_tokenizer_params = params.pop('character_tokenizer', {})
    character_tokenizer = CharacterTokenizer.from_params(character_tokenizer_params)
    params.assert_empty(cls.__name__)
    return cls(namespace=namespace, character_tokenizer=character_tokenizer)
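The docstring above explains how the two keys are consumed; a minimal sketch of driving this from configuration might look like the following. It assumes the AllenNLP Params class is importable, and the byte_encoding value is only an illustrative choice, not a requirement.

# Minimal sketch, assuming allennlp's Params is available; values are illustrative.
from allennlp.common import Params

indexer_params = Params({
    "namespace": "token_characters",
    "character_tokenizer": {"byte_encoding": "utf-8"},
})
indexer = TokenCharactersIndexer.from_params(indexer_params)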
def __init__(self,
             tokenizer: Tokenizer,
             token_indexers: Dict[str, TokenIndexer],
             combine_input_fields: Optional[bool] = False,
             add_special_symbols: Optional[bool] = False,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or CharacterTokenizer(lowercase_characters=True)
    self._token_indexer = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._combine_input_fields = combine_input_fields
    self._add_special_symbols = add_special_symbols
    self._start_symbol = START_SYMBOL
    self._end_symbol = END_SYMBOL
def construct_reader():
    from data_reader.dialogue_reader import FollowUpDataReader
    character_tokenizer = CharacterTokenizer(byte_encoding="utf-8",
                                             start_tokens=[259],
                                             end_tokens=[260])
    token_character_indexer = TokenCharactersIndexer(
        character_tokenizer=character_tokenizer, min_padding_length=5)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = FollowUpDataReader(
        token_indexer={
            # "elmo": elmo_indexer,
            "token_words": token_indexer
        },
        char_indexer={
            "token_characters": token_character_indexer,
        },
        is_pretrain=True)
    return reader
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
def prepare_dataset_reader(args):
    assert args.transformer is not None
    token_indexers = {
        'transformer': TransformerIndexer(
            model_name=args.transformer,
            use_starting_offsets=True,
            truncate_long_sequences=True,
            max_pieces=510,
            do_lowercase="uncased" in args.transformer,
        )
    }
    tokenizer = CharacterTokenizer() if args.character else None
    dataset_reader = TextClassificationReader(
        tokenizer=tokenizer,
        token_indexers=token_indexers,
        max_length=args.max_length,
        multi_label=args.classification_type == 'bce',
    )
    return dataset_reader