def tokenize(self, text: str) -> List[Token]:
    tokens = []
    if self._with_postag:
        response = self._tokenizer.getTagsToString(text)

        # FIXME Following dirty workaround is required to
        #       process inputs which include <whitespace> itself
        #       (e.g. "私 は猫")
        # A whitespace token produces two consecutive spaces in KyTea's output,
        # so the double space (not every space) is rewritten to a placeholder.
        response = response.replace("\\ ", "<SPACE>").replace("  ", " <SPACE>")

        for elem in response.split(" ")[:-1]:
            # FIXME If input contains a character "/",
            #       KyTea outputs "//補助記号/・",
            #       which breaks the simple logic elem.split("/")
            pron, postag, surface = map(
                lambda e: e[::-1], elem[::-1].split("/", maxsplit=2))
            surface = surface.replace("<SPACE>", " ")
            tokens.append(Token(surface=surface, postag=postag, pron=pron))

    else:
        for surface in list(self._tokenizer.getWS(text)):
            tokens.append(Token(surface=surface))

    return tokens
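# A minimal standalone sketch of how the underlying KyTea binding is typically
# driven (assuming the Mykytea-python package and a default model being
# installed): getTagsToString returns the "surface/postag/pron " format parsed
# above, and getWS returns plain word segmentation. The option string and the
# sample sentence are illustrative only, and the printed output is an example.
import Mykytea

kytea = Mykytea.Mykytea("")  # empty option string: rely on the default model
print(list(kytea.getWS("私は猫")))      # e.g. ['私', 'は', '猫']
print(kytea.getTagsToString("私は猫"))  # e.g. '私/名詞/わたし は/助詞/は 猫/名詞/ねこ '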
def tokenize(self, text: str) -> List[Token]:
    return_result = []
    parse_result = self._tokenizer.tokenize(text)
    if self._with_postag:
        for morph in parse_result:
            surface = morph.surface
            postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
            inflection = morph.infl_type
            conjugation = morph.infl_form
            base_form = morph.base_form
            yomi = morph.reading
            pron = morph.phonetic

            token = Token(
                surface=surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=inflection,
                conjugation=conjugation,
                base_form=base_form,
                yomi=yomi,
                pron=pron,
            )
            return_result.append(token)
    else:
        for morph in parse_result:
            return_result.append(Token(surface=morph.surface))
    return return_result
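# For reference, a hedged sketch of the janome API this wrapper relies on:
# each item yielded by Tokenizer.tokenize exposes surface, part_of_speech,
# infl_type, infl_form, base_form, reading and phonetic attributes.
# The sample sentence is illustrative only.
from janome.tokenizer import Tokenizer

janome_tokenizer = Tokenizer()
for morph in janome_tokenizer.tokenize("すもももももももものうち"):
    print(morph.surface, morph.part_of_speech, morph.base_form, morph.reading)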
def tokenize(self, text: str) -> List[Token]: """Tokenize.""" result = [] for token in self._tokenizer.tokenize(text, self._mode): surface = token.surface() if self._with_postag: ( postag, postag2, postag3, postag4, inflection, conjugation, ) = token.part_of_speech() base_form = token.dictionary_form() normalized_form = token.normalized_form() yomi = token.reading_form() result.append( Token( surface=surface, postag=postag, postag2=postag2, postag3=postag3, postag4=postag4, inflection=inflection, conjugation=conjugation, base_form=base_form, normalized_form=normalized_form, yomi=yomi, )) else: result.append(Token(surface=surface)) return result
def tokenize(self, text: str) -> List[Token]:
    response = self._tokenizer.tagging(text)
    tokens = [
        Token(surface=surface, postag=postag)
        for (surface, postag) in zip(response.words, response.postags)
    ]
    return tokens
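# The nagisa call above returns a single tagged object whose words and postags
# lists are aligned index by index. A hedged usage sketch (the sample sentence
# is illustrative only):
import nagisa

tagged = nagisa.tagging("私は猫が好きです")
print(tagged.words)    # surface forms
print(tagged.postags)  # their part-of-speech tags, in the same order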
def parse_feature_for_unidic(elem) -> Token: """ UniDic format: https://unidic.ninjal.ac.jp/faq """ surface, feaature_line = elem.split("\t") features = feaature_line.split(",") postag = features[0] or None postag2 = features[1] or None postag3 = features[2] or None postag4 = features[3] or None inflection = features[4] or None conjugation = features[5] or None if len(features) >= 10: yomi = features[6] or None base_form = features[7] or None pron = features[9] or None else: yomi = None base_form = None pron = None return Token( surface=surface, postag=postag, postag2=postag2, postag3=postag3, postag4=postag4, inflection=inflection, conjugation=conjugation, base_form=base_form, yomi=yomi, pron=pron, )
def parse_feature_for_ipadic(elem) -> Token: surface, feature = elem.split("\t") ( postag, postag2, postag3, postag4, inflection, conjugation, base_form, *other, ) = feature.split(",") # For words not in a dictionary if len(other) == 2: yomi, pron = other else: yomi, pron = None, None return Token( surface=surface, postag=postag, postag2=postag2, postag3=postag3, postag4=postag4, inflection=inflection, conjugation=conjugation, base_form=base_form, yomi=yomi, pron=pron, )
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict): if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params[ "system_dictionary_path"].startswith("s3://"): pytest.skip("AWS credentials not found.") tokenizer_name = tokenizer_params["tokenizer"] tokenizer = WordTokenizer(**tokenizer_params) expect = [[Token.from_dict(token_param) for token_param in token_params] for token_params in read_lines(tokenizer_name)] result = tokenizer.batch_tokenize(raw_texts) assert expect == result
def tokenize(self, text: str) -> List[Token]: """Tokenize""" return_result = [] parse_result = self._tokenizer.parse(text).rstrip(" ") if self._with_postag: for elem in parse_result.split("\n")[:-1]: return_result.append(self._parse_feature(elem)) else: for surface in parse_result.split(" "): return_result.append(Token(surface=surface)) return return_result
def test_token_with_postag2():
    token = Token(
        surface="大崎",
        postag="名詞",
        postag2="固有名詞,人名,姓",
        inflection="*",
        conjugation="*",
        base_form="大崎",
        yomi="オオサキ",
        pron="オーサキ",
    )
    truth = "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ"
    assert token.feature == truth
def tokenize(self, text: str) -> List[Token]: """Tokenize input text""" if isinstance(self._endpoint, str): endpoint = self.get_endpoint("/api/v1/tokenize") payload = dict(self.payload, text=text) token_params = self._tokenize_with_remote_host( endpoint=endpoint, payload=payload, headers=self.headers) return [ Token.from_dict(token_param) for token_param in token_params ] else: assert self._tokenizer is not None return self._tokenizer.tokenize(text)
def tokenize(self, text: str) -> List[Token]:
    result = []
    for token in self._tokenizer.tokenize(text, self._mode):
        (postag, postag2, postag3, postag4,
         inflection, conjugation) = token.part_of_speech()
        result.append(
            Token(
                surface=token.surface(),
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=inflection,
                conjugation=conjugation,
                base_form=token.dictionary_form(),
                normalized_form=token.normalized_form(),
                yomi=token.reading_form(),
            )
        )
    return result
def tokenize(self, text: str) -> List[Token]:
    return_result = []
    for morph in self._tokenizer.tokenize(text):
        postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
        token = Token(
            surface=morph.surface,
            postag=postag,
            postag2=postag2,
            postag3=postag3,
            postag4=postag4,
            inflection=morph.infl_type,
            conjugation=morph.infl_form,
            base_form=morph.base_form,
            yomi=morph.reading,
            pron=morph.phonetic,
        )
        return_result.append(token)
    return return_result
def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
    """Tokenize input texts"""
    if isinstance(self._endpoint, str):
        endpoint = self.get_endpoint("/api/v1/batch_tokenize")
        payload = dict(self.payload, texts=texts)
        token_params_list = self._batch_tokenize_with_remote_host(
            endpoint=endpoint,
            payload=payload,
            headers=self.headers,
        )
        tokens_list: List[List[Token]] = []
        for tokens in token_params_list:
            tokens_list.append(
                [Token.from_dict(token) for token in tokens])
        return tokens_list
    else:
        assert self._tokenizer is not None
        return [self._tokenizer.tokenize(text) for text in texts]
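# A hedged sketch of what the remote branch presumably does under the hood:
# POST the payload to the server endpoint and read token dicts back for
# Token.from_dict. The use of requests and the "tokens" response key are
# assumptions made for illustration, not the library's confirmed wire format.
import requests

def _batch_tokenize_with_remote_host_sketch(endpoint, payload, headers):
    response = requests.post(endpoint, json=payload, headers=headers)
    response.raise_for_status()
    return response.json()["tokens"]  # hypothetical response key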
def test_token_with_postag(): token = Token(surface="大崎", postag="名詞") assert "大崎" == token.surface assert "名詞" == token.feature
def tokenize(self, text: str) -> List[Token]:
    result = []
    for subword in self._tokenizer.EncodeAsPieces(text):
        token = Token(surface=subword)
        result.append(token)
    return result
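# A minimal sketch of the sentencepiece API used above; the model path is a
# placeholder and must point to a trained SentencePiece model file.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("/path/to/model.spm")  # hypothetical model path
print(sp.EncodeAsPieces("私は猫が好きです"))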
def tokenize(self, text: str) -> List[Token]: return [Token(surface=surface) for surface in text.split(" ")]
def tokenize(_, text: str):
    return [Token(surface=char) for char in list(text)]
def test_token_without_feature(): token = Token(surface="大崎") assert "大崎" == token.surface assert "" == token.feature