def test_count_vocab_items_case_insensitive(self):
    indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["words"] == {"hello": 2}
def test_count_vocab_items_respect_casing(self):
    indexer = SingleIdTokenIndexer("words")
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["words"] == {"hello": 1, "Hello": 1}
def __init__(self,
             namespace: str = 'tokens',
             lowercase_tokens: bool = False,
             start_tokens: List[str] = None,
             end_tokens: List[str] = None,
             token_min_padding_length: int = 0) -> None:
    super(SingleIdTokenIndexer, self).__init__(token_min_padding_length)
    self.namespace = namespace
    self.lowercase_tokens = lowercase_tokens
    # Optional marker tokens to prepend/append when indexing a token sequence.
    self.start_tokens = [Token(t) for t in (start_tokens or [])]
    self.end_tokens = [Token(t) for t in (end_tokens or [])]
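# A minimal usage sketch (added for illustration, not part of the original source):
# it assumes SingleIdTokenIndexer, Token, and defaultdict are in scope as in the
# tests above, and shows how lowercase_tokens collapses differently-cased surface
# forms into a single vocabulary entry when counting.
indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
counter = defaultdict(lambda: defaultdict(int))
for text in ["The", "the", "THE"]:
    indexer.count_vocab_items(Token(text), counter)
assert counter["words"] == {"the": 3}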
def __init__(self,
             namespace: str = "token_characters",
             character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
             start_tokens: List[str] = None,
             end_tokens: List[str] = None,
             min_padding_length: int = 0,
             token_min_padding_length: int = 0) -> None:
    super(TokenCharacterIndexer, self).__init__(token_min_padding_length)
    self.min_padding_length = min_padding_length
    self.namespace = namespace
    self.character_tokenizer = character_tokenizer
    # Use None defaults above rather than mutable list literals; fall back to empty lists here.
    self.start_tokens = [Token(t) for t in (start_tokens or [])]
    self.end_tokens = [Token(t) for t in (end_tokens or [])]
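# A hedged construction sketch (added; not in the original source). It only exercises
# the constructor above: start/end marker strings are wrapped into Token objects, and
# min_padding_length reserves a minimum character width per token when padding. It
# assumes Token exposes its constructor argument as a .text attribute.
char_indexer = TokenCharacterIndexer(
    namespace="chars",
    start_tokens=["<s>"],
    end_tokens=["</s>"],
    min_padding_length=5,
)
assert [t.text for t in char_indexer.start_tokens] == ["<s>"]
assert char_indexer.min_padding_length == 5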
def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:
    tokenized_source = self.source_tokenizer.tokenize(source_string)
    if self.source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self.source_token_indexers)
    if target_string is not None:
        tokenized_target = self.target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self.target_token_indexers)
        return Instance({"source_tokens": source_field, "target_tokens": target_field})
    return Instance({"source_tokens": source_field})
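# A hedged usage sketch (added; not from the original source): `reader` stands in for
# whatever seq2seq dataset reader defines text_to_instance above, and the sketch
# assumes Instance keeps its fields in a dict attribute named `fields`. With a target
# string the instance carries both fields; without one, only "source_tokens" exists.
instance = reader.text_to_instance("all these sentences", "toutes ces phrases")
assert set(instance.fields.keys()) == {"source_tokens", "target_tokens"}
source_only = reader.text_to_instance("all these sentences")
assert set(source_only.fields.keys()) == {"source_tokens"}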
def split_words(self, sentence: str) -> List[Token]:
    # Note: the \p{Z} Unicode-property class requires the third-party `regex` module
    # (import regex); the standard-library `re` module rejects it as a bad escape.
    tokens = regex.findall(
        r"(?:\d+,\d+)"                                            # numbers with a thousands separator
        r"|(?:[\w'\u0080-\u9999]+(?:[-]+[\w'\u0080-\u9999]+)+)"   # hyphenated words
        r"|(?:[\w\u0080-\u9999]+(?:[']+[\w\u0080-\u9999]+)+)"     # contractions such as isn't
        r"|\b[_]"                                                 # a lone underscore
        r"|(?:[_]*[\w\u0080-\u9999]+(?=_\b))"                     # words followed by a trailing underscore
        r"|(?:[\w\u0080-\u9999]+)"                                # plain words
        r"|[^\w\s\p{Z}]",                                         # any other non-space character (punctuation)
        sentence,
        regex.UNICODE,
    )
    return [Token(t) for t in tokens]
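# A small example (added for illustration) of what the regex-based splitter keeps
# together: thousands-separated numbers, hyphenated words, and contractions stay
# single tokens, while other punctuation is split off. `splitter` is a hypothetical
# instance of the class defining split_words above, and Token is assumed to expose .text.
tokens = [t.text for t in splitter.split_words("It isn't state-of-the-art, but 1,000 users!")]
assert tokens == ["It", "isn't", "state-of-the-art", ",", "but", "1,000", "users", "!"]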
def split_words(self, sentence: str) -> List[Token]:
    # Simple whitespace tokenizer: split on runs of whitespace only.
    return [Token(t) for t in sentence.split()]
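# Contrast with the regex splitter above (an added sketch, using the same assumed Token
# API and a hypothetical `whitespace_splitter` instance): plain whitespace splitting
# keeps punctuation glued to the neighbouring word.
tokens = [t.text for t in whitespace_splitter.split_words("Hello, world!")]
assert tokens == ["Hello,", "world!"]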