Example 1
    def test_count_vocab_items_case_insensitive(self):
        indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)

        assert counter["words"] == {"hello": 2}
Example 2
    def test_count_vocab_items_respect_casing(self):
        indexer = SingleIdTokenIndexer("words")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)

        assert counter["words"] == {"hello": 1, "Hello": 1}
Example 3
    def __init__(self,
                 namespace: str = 'tokens',
                 lowercase_tokens: bool = False,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None,
                 token_min_padding_length: int = 0) -> None:
        super(SingleIdTokenIndexer, self).__init__(token_min_padding_length)
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens

        self.start_tokens = [Token(t) for t in (start_tokens or [])]
        self.end_tokens = [Token(t) for t in (end_tokens or [])]
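
The two tests in Examples 1 and 2 exercise count_vocab_items, which is not shown in these snippets. The following is a minimal sketch, assuming only the namespace and lowercase_tokens attributes set in the constructor above and the same Token/typing imports as the other snippets, of behaviour consistent with both tests; it is not the library's actual implementation.

    # Hedged sketch of SingleIdTokenIndexer.count_vocab_items, reconstructed
    # from the two tests above rather than taken from the library.
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]) -> None:
        text = token.text
        if self.lowercase_tokens:
            text = text.lower()
        # Counts accumulate per namespace, e.g. counter["words"]["hello"] += 1.
        counter[self.namespace][text] += 1
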
Example 4
    def __init__(
            self,
            namespace: str = "token_characters",
            character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
            start_tokens: List[str] = [],
            end_tokens: List[str] = [],
            min_padding_length: int = 0,
            token_min_padding_length: int = 0) -> None:
        # Initialize the base indexer with the shared token_min_padding_length setting.
        super(TokenCharacterIndexer, self).__init__(token_min_padding_length)

        self.min_padding_length = min_padding_length
        self.namespace = namespace
        self.character_tokenizer = character_tokenizer
        self.start_tokens = [Token(t) for t in start_tokens]
        self.end_tokens = [Token(t) for t in end_tokens]
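
For orientation, a hypothetical construction of the indexer above; the argument values are illustrative, and the reading of min_padding_length as a lower bound on each token's padded character length is an assumption, not something stated in these snippets.

    # Hypothetical usage; "chars", "<s>", "</s>" and 5 are illustrative values only.
    char_indexer = TokenCharacterIndexer(
        namespace="chars",
        start_tokens=["<s>"],
        end_tokens=["</s>"],
        min_padding_length=5,  # assumed: minimum padded character length per token
    )
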
Example 5
    def text_to_instance(self,
                         source_string: str,
                         target_string: str = None) -> Instance:
        tokenized_source = self.source_tokenizer.tokenize(source_string)
        if self.source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))

        source_field = TextField(tokenized_source, self.source_token_indexers)

        if target_string is not None:
            tokenized_target = self.target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self.target_token_indexers)

            return Instance({
                "source_tokens": source_field,
                "target_tokens": target_field
            })
        else:
            return Instance({"source_tokens": source_field})
Example 6
  def split_words(self, sentence: str) -> List[Token]:
    # Keeps comma-separated numbers (e.g. 3,000), hyphenated words and
    # contractions as single tokens, and emits other punctuation one character
    # at a time. The \p{Z} escape is rejected by the standard-library re on
    # Python 3.6+, so `re` here presumably refers to the third-party regex module.
    tokens = re.findall(r"(?:\d+,\d+)|(?:[\w'\u0080-\u9999]+(?:[-]+[\w'\u0080-\u9999]+)+)|(?:[\w\u0080-\u9999]+(?:[']+[\w\u0080-\u9999]+)+)|\b[_]|(?:[_]*[\w\u0080-\u9999]+(?=_\b))|(?:[\w\u0080-\u9999]+)|[^\w\s\p{Z}]", sentence, re.UNICODE)

    return [Token(t) for t in tokens]
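
To make the pattern concrete, a hedged illustration of the tokens it yields, assuming splitter is an instance of the (unshown) class that defines split_words above and that the regex module is available; the sample sentence and expected list are derived by hand from the alternatives in the pattern.

    # Hedged illustration; `splitter` and the sample sentence are assumptions.
    tokens = splitter.split_words("state-of-the-art isn't 3,000 dollars.")
    assert [t.text for t in tokens] == ["state-of-the-art", "isn't", "3,000", "dollars", "."]
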
Example 7
  def split_words(self, sentence: str) -> List[Token]:
    return [Token(t) for t in sentence.split()]
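
For contrast with Example 6, the whitespace-only splitter above leaves punctuation attached to the preceding word; a minimal hedged illustration, again assuming a splitter instance of the (unshown) surrounding class.

    # Hedged illustration; `splitter` and the sample sentence are assumptions.
    tokens = splitter.split_words("the cat sat.")
    assert [t.text for t in tokens] == ["the", "cat", "sat."]
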