def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
    assert len(ja_tokenizer(text)) == len_a
    assert len(nlp_a(text)) == len_a
    assert len(nlp_b(text)) == len_b
    assert len(nlp_c(text)) == len_c

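For reference, a minimal standalone sketch (not part of the test above, and assuming sudachipy and sudachidict_core are installed) of how the three Sudachi split modes segment a compound noun; the expected splits follow the example given in the spaCy docs for Japanese:

# Sketch: compare the A/B/C split modes on a single compound noun
from spacy.lang.ja import Japanese

nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

text = "選挙管理委員会"
print([t.text for t in nlp_a(text)])  # expected: ['選挙', '管理', '委員', '会']
print([t.text for t in nlp_b(text)])  # expected: ['選挙', '管理', '委員会']
print([t.text for t in nlp_c(text)])  # expected: ['選挙管理委員会']
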
def test_ja_tokenizer_sub_tokens(
    ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
):
    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
    nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c

def test_ja_tokenizer_sub_tokens(
    ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
    assert ja_tokenizer(text).user_data.get("sub_tokens") is None
    assert nlp_a(text).user_data.get("sub_tokens") is None
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c

def main():
    text = "私は機能性食品を購入した。"
    a_nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
    b_nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
    c_nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
    print('mode: A:')
    show(a_nlp, text)
    print('mode: B:')
    show(b_nlp, text)
    print('mode: C:')
    show(c_nlp, text)
    print('DONE')

def test_issue2901():
    """Test that `nlp` doesn't fail."""
    try:
        nlp = Japanese()
    except ImportError:
        pytest.skip()
    doc = nlp("pythonが大好きです")
    assert doc

def main(
    input_path='-',
    retokenize=False,
    extend_dep_labels=False,
    paragraph_id_regex=r'^(.*)[\-:][^\-:]*$',
    n_sents=10,
    augmentation=False,
    ensure_end_period=False,
    luw_ent=False,
):
    if luw_ent:
        assert not retokenize, "the retokenize option must be disabled when the luw_ent option is used"
    if retokenize:
        nlp = Japanese()
        tokenizer = JapaneseTokenizer(nlp=nlp, split_mode=retokenize)
    else:
        tokenizer = None
    out = sys.stdout
    docs = convert_files(
        input_path,
        tokenizer,
        paragraph_id_regex,
        n_sents,
        extend_dep_labels,
        ensure_end_period,
        luw_ent,
    )
    if augmentation:
        random.seed(1)
        docs = [{
            'id': doc['id'],
            'paragraphs': sum([char_augmentation(p) for p in doc['paragraphs']], []),
        } for doc in docs]
    print_json(docs, out)

from spacy.lang.ja import Japanese
from spacy.tokens import Span

nlp = Japanese()

# Define the method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"

# Register to_html as the span's "to_html" extension attribute
Span.set_extension("to_html", method=to_html)

# Process the text and call the span's to_html method with the tag "strong"
doc = nlp("おはようございます、 これは文章です。")
span = doc[0:3]
print(span._.to_html("strong"))

import json
from spacy.matcher import Matcher
from spacy.lang.ja import Japanese

with open("exercises/ja/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Japanese()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Two tokens: a lowercase form matching "iphone", followed by a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the results
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])

import json
from spacy.lang.ja import Japanese
from spacy.tokens import Doc

with open("exercises/ja/bookquotes.json") as f:
    DATA = json.loads(f.read())

nlp = Japanese()

# Register the Doc extension attribute "author" with default value None
Doc.set_extension("author", default=None)

# Register the Doc extension attribute "book" with default value None
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and the custom attributes
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")

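For context, `nlp.pipe(DATA, as_tuples=True)` expects an iterable of `(text, context)` pairs, so each entry in bookquotes.json presumably pairs a quote with its metadata. A tiny sketch with made-up placeholder data (not the actual file contents):

# Placeholder (text, context) pairs, just to illustrate the as_tuples format
example_data = [
    ("テキストの例。", {"author": "著者名", "book": "書名"}),
]
for doc, context in nlp.pipe(example_data, as_tuples=True):
    print(doc.text, context["author"], context["book"])
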
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque
from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}

class Tokenizer:
    def __init__(self, language, tokenizer_method='spacy', remove_stopwords=True,
                 lowercase=True, strip_accents=None, ngram_range=(1, 1),
                 min_freq=1, max_freq_perc=1.0,

import json from spacy.lang.ja import Japanese with open("exercises/ja/countries.json") as f: COUNTRIES = json.loads(f.read()) nlp = Japanese() doc = nlp("チェコ共和国はスロバキアの領空保護に協力する可能性がある") # PhraseMatcherをインポートして初期化 from spacy.____ import ____ matcher = ____(____) # パターンを表すDocオブジェクトを作成し、matcherに追加 # これは[nlp(country) for country in COUNTRIES]の高速バージョンです patterns = list(nlp.pipe(COUNTRIES)) matcher.add("COUNTRY", None, *patterns) # サンプルのdocにmatcherを適用し、結果をプリントします matches = ____(____) print([doc[start:end] for match_id, start, end in matches])
def test_ja_morphologizer_factory():
    pytest.importorskip("sudachipy")
    nlp = Japanese()
    morphologizer = nlp.add_pipe("morphologizer")
    assert morphologizer.cfg["extend"] is True

import json
from spacy.lang.ja import Japanese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/ja/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/ja/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = Japanese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))

def countries_component(doc):
    # Create a Span with the label "GPE" for every match
    matches = matcher(doc)
    doc.ents = [
        ____(____, ____, ____, label=____) for match_id, start, end in matches
    ]
    return doc

# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

def test_ja_tokenizer_serialize(ja_tokenizer):
    tokenizer_bytes = ja_tokenizer.to_bytes()
    nlp = Japanese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.split_mode is None

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        ja_tokenizer.to_disk(file_path)
        nlp = Japanese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.split_mode is None

    # split mode is (de)serialized correctly
    nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_r = Japanese()
    nlp_bytes = nlp.to_bytes()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.split_mode == "B"

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Japanese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.split_mode == "B"

from spacy.lang.ja import Japanese

nlp = Japanese()
people = ["デヴィッド・ボウイ", "アンゲラ・メルケル", "レディー・ガガ"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))

import json
from spacy.lang.ja import Japanese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/ja/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/ja/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = Japanese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))

def countries_component(doc):
    # Create a Span with the label "GPE" for every match
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc

# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)
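
The `get_capital` getter above is defined but never wired up in this snippet; a possible follow-up (an assumption, not shown in the original) is to register it as a Span extension so each matched entity exposes its country's capital:

# Register get_capital as a getter so span._.capital looks up the capital city
Span.set_extension("capital", getter=get_capital)

doc = nlp("チェコ共和国はスロバキアの領空保護に協力する可能性がある")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])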