def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    assert len(ja_tokenizer(text)) == len_a
    assert len(nlp_a(text)) == len_a
    assert len(nlp_b(text)) == len_b
    assert len(nlp_c(text)) == len_c


def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a,
                                 sub_tokens_list_b, sub_tokens_list_c):
    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
    nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})

    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
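The two tests above come from spaCy's own test suite and rely on pytest fixtures, so they are not runnable as shown. The sketch below is a minimal standalone version, assuming spaCy v3 with sudachipy and sudachidict_core installed; it prints the tokens produced by each SudachiPy split mode for the sample sentence used in the main() example further down (the exact segmentation depends on the installed dictionary).

from spacy.lang.ja import Japanese

text = "私は機能性食品を購入した。"
for mode in ("A", "B", "C"):
    # Build a blank Japanese pipeline with the requested SudachiPy split mode
    nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": mode}}})
    print(mode, [token.text for token in nlp(text)])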
Example 3
def test_ja_tokenizer_sub_tokens(
    ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    assert ja_tokenizer(text).user_data.get("sub_tokens") is None
    assert nlp_a(text).user_data.get("sub_tokens") is None
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
Example 4
def main():
    text = "私は機能性食品を購入した。"
    a_nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
    b_nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
    c_nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})

    print('mode: A:')
    show(a_nlp, text)

    print('mode: B:')
    show(b_nlp, text)

    print('mode: C:')
    show(c_nlp, text)

    print('DONE')
Example 5
def test_issue2901():
    """Test that `nlp` doesn't fail."""
    try:
        nlp = Japanese()
    except ImportError:
        pytest.skip()

    doc = nlp("pythonが大好きです")
    assert doc
Example 6
def main(
    input_path='-',
    retokenize=False,
    extend_dep_labels=False,
    paragraph_id_regex=r'^(.*)[\-:][^\-:]*$',
    n_sents=10,
    augmentation=False,
    ensure_end_period=False,
    luw_ent=False,
):
    if luw_ent:
        assert not retokenize, "the retokenize option must be disabled when the luw_ent option is used"

    if retokenize:
        nlp = Japanese()
        tokenizer = JapaneseTokenizer(nlp=nlp, split_mode=retokenize)
    else:
        tokenizer = None
    out = sys.stdout
    docs = convert_files(
        input_path,
        tokenizer,
        paragraph_id_regex,
        n_sents,
        extend_dep_labels,
        ensure_end_period,
        luw_ent,
    )
    if augmentation:
        random.seed(1)
        docs = [
            {
                'id': doc['id'],
                'paragraphs': sum([char_augmentation(p) for p in doc['paragraphs']], []),
            }
            for doc in docs
        ]
    print_json(docs, out)
Example 7
from spacy.lang.ja import Japanese
from spacy.tokens import Span

nlp = Japanese()


# Define a method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register to_html as the "to_html" extension attribute on Span
Span.set_extension("to_html", method=to_html)

# Process the text and call the span's to_html method with the "strong" tag
doc = nlp("おはようございます、 これは文章です。")
span = doc[0:3]
print(span._.to_html("strong"))
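The same idea can be expressed as a property-style extension that uses a getter instead of a method; the sketch below is a standalone variant, where the attribute name "html_strong" is made up for illustration.

from spacy.lang.ja import Japanese
from spacy.tokens import Span

nlp = Japanese()

# Property-style variant: the getter computes the value on each attribute access
Span.set_extension("html_strong", getter=lambda span: f"<strong>{span.text}</strong>")

doc = nlp("おはようございます、 これは文章です。")
print(doc[0:3]._.html_strong)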
Example 8
import json
from spacy.matcher import Matcher
from spacy.lang.ja import Japanese

with open("exercises/ja/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Japanese()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Two tokens whose lowercase forms match "iphone" and a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the results
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
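This exercise uses the spaCy v2 signature of Matcher.add, where an on_match callback is passed positionally and each pattern is a separate argument. In spaCy v3 the patterns are grouped into one list and the callback becomes a keyword argument; the sketch below shows the v3 form with a made-up example sentence in place of the exercise data.

from spacy.lang.ja import Japanese
from spacy.matcher import Matcher

nlp = Japanese()
matcher = Matcher(nlp.vocab)

pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# spaCy v3: patterns go in a single list; an optional callback is passed as on_match=
matcher.add("GADGET", [pattern1, pattern2])

doc = nlp("iPhone Xの発売日はまだわからない")
print([doc[start:end] for match_id, start, end in matcher(doc)])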
Example 9
import json
from spacy.lang.ja import Japanese
from spacy.tokens import Doc

with open("exercises/ja/bookquotes.json") as f:
    DATA = json.loads(f.read())

nlp = Japanese()

# Register the Doc extension attribute "author" with default value None
Doc.set_extension("author", default=None)

# Register the Doc extension attribute "book" with default value None
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and the custom attributes
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")
Example 10
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
                 remove_stopwords=True,
                 lowercase=True,
                 strip_accents=None,
                 ngram_range=(1, 1),
                 min_freq=1,
                 max_freq_perc=1.0,
Example 11
import json
from spacy.lang.ja import Japanese

with open("exercises/ja/countries.json") as f:
    COUNTRIES = json.loads(f.read())

nlp = Japanese()
doc = nlp("チェコ共和国はスロバキアの領空保護に協力する可能性がある")

# Import and initialize the PhraseMatcher
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Create Doc objects for the patterns and add them to the matcher
# This is a faster version of [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Apply the matcher to the example doc and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
Example 12
def test_ja_morphologizer_factory():
    pytest.importorskip("sudachipy")
    nlp = Japanese()
    morphologizer = nlp.add_pipe("morphologizer")
    assert morphologizer.cfg["extend"] is True
Example 13
import json
from spacy.lang.ja import Japanese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/ja/countries.json") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/ja/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = Japanese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create a span with the label "GPE" for all of the matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)


def test_ja_tokenizer_serialize(ja_tokenizer):
    tokenizer_bytes = ja_tokenizer.to_bytes()
    nlp = Japanese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.split_mode is None

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        ja_tokenizer.to_disk(file_path)
        nlp = Japanese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.split_mode is None

    # split mode is (de)serialized correctly
    nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_r = Japanese()
    nlp_bytes = nlp.to_bytes()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.split_mode == "B"

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Japanese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.split_mode == "B"
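The test above checks the low-level to_bytes/to_disk round trip; in everyday use the same behaviour is reachable through nlp.to_disk and spacy.load. A small sketch, assuming spaCy v3 with the SudachiPy dependencies installed and ./ja_split_b as an arbitrary output directory:

import spacy
from spacy.lang.ja import Japanese

nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp.to_disk("./ja_split_b")  # writes config.cfg along with the tokenizer settings
nlp_loaded = spacy.load("./ja_split_b")
print(nlp_loaded.tokenizer.split_mode)  # expected to print "B", matching the test above
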
from spacy.lang.ja import Japanese

nlp = Japanese()

people = ["デヴィッド・ボウイ", "アンゲラ・メルケル", "レディー・ガガ"]

# PhraseMatcherのパターンのリストを作成
patterns = list(nlp.pipe(people))
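The snippet stops after building the pattern list. A hedged completion, continuing directly from the nlp and patterns defined above and assuming spaCy v3 (the example sentence is invented, and whether a name is found depends on SudachiPy segmenting it the same way inside a sentence as in isolation):

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
# spaCy v3 signature; the v2 equivalent would be matcher.add("PERSON", None, *patterns)
matcher.add("PERSON", patterns)

doc = nlp("アンゲラ・メルケルがベルリンで演説した")
print([doc[start:end].text for match_id, start, end in matcher(doc)])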
Example 16
import json
from spacy.lang.ja import Japanese
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/ja/countries.json") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/ja/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = Japanese()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create a span with the label "GPE" for all of the matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)
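The solution stops right after defining get_capital. The natural next step, continuing from the code above, is to register the getter as a Span extension and read it off the entities; this sketch assumes the exercise JSON files contain the country and capital names used in the sample sentence, and that the spaCy v2-style nlp.add_pipe(countries_component) call above succeeded (spaCy v3 would additionally require registering the function with @Language.component and adding it by name).

# Register the getter as the Span extension attribute "capital"
Span.set_extension("capital", getter=get_capital)

# Process a text and print the entity text, label and capital for each match
doc = nlp("チェコ共和国はスロバキアの領空保護に協力する可能性がある")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])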