Code example #1
File: utils.py  Project: ktrw1011/nlp-100knock-2020
def analysis_bunsetu(doc):
    """ginzaの係り受け解析結果を受け取り、文節クラスのリスト(Chunk)を返す
    """
    chunks = []
    # Get each bunsetu's head number (= the token index from morphological analysis)
    bunsetu_head_list = ginza.bunsetu_head_list(doc)

    # Split the doc into bunsetu (phrase units)
    for i, bunsetu in enumerate(ginza.bunsetu_spans(doc)):
        # Instantiate a Chunk object representing this bunsetu
        chunk = Chunk()
        # Collect the morphemes of the bunsetu
        morphs = get_morphs(bunsetu)
        # Store them on the chunk and append it to the list
        chunk.morphs = morphs
        chunks.append(chunk)

        # Dependency relations between bunsetu
        for token in bunsetu.lefts:
            # Get the index of the bunsetu that this dependent token belongs to
            chunk_idx = get_bunsetu_head_list_index(bunsetu_head_list, token.i)
            chunks[chunk_idx].dst = i

            chunks[i].srcs.append(chunk_idx)

    return chunks
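
The snippet above relies on a project-local Chunk class plus get_morphs and get_bunsetu_head_list_index helpers that are not shown. A minimal sketch of what they might look like, inferred only from how they are used here (the project's real definitions may differ):

class Chunk:
    def __init__(self):
        self.morphs = []   # morphemes making up the bunsetu
        self.dst = -1      # index of the bunsetu this one depends on
        self.srcs = []     # indices of bunsetu that depend on this one


def get_morphs(bunsetu_span):
    # Placeholder: the project likely wraps each token in a Morph object;
    # here we simply keep the surface forms.
    return [token.orth_ for token in bunsetu_span]


def get_bunsetu_head_list_index(bunsetu_head_list, token_i):
    # Position of token_i in the head list = index of the bunsetu headed by
    # that token (assumes cross-bunsetu arcs attach at bunsetu head tokens).
    return bunsetu_head_list.index(token_i)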
Code example #2
# Imports assumed by this snippet; Toc, create_manual, DICT_POS_JP and DICT_DEP_JP
# are project-local helpers that are not shown here.
import ginza
import pandas as pd
import spacy
import streamlit as st
import streamlit.components.v1 as stc


def main():
    st.set_page_config(layout="wide", initial_sidebar_state="expanded")
    st.title("GiNZA NLP Library")
    toc = Toc()
    toc.placeholder(True)

    input_list = st.text_area("入力文字列",
                              '銀座でランチをご一緒しましょう。今度の日曜日はどうですか。\n吾輩は猫である。 名前はまだ無い。 ').splitlines()
    ignore_lf = st.checkbox("改行を無視して1回で解析する。", False)
    if not st.button("実行"):
        st.stop()
        return
    if ignore_lf:
        input_list = ["".join(input_list)]
    with st.spinner('Wait for it...'):
        nlp = spacy.load('ja_ginza')
        # time.sleep(1.0)
        for i, input_str in enumerate(input_list):
            doc = nlp(input_str)
            for j, sent in enumerate(doc.sents):
                toc.subheader(f"{i + 1}-{j + 1}. {sent}")
                svg2 = spacy.displacy.render(create_manual(sent), style="dep",
                                             options={"compact": True, "offset_x": 200, "distance": 175}, manual=True)
                st.image(svg2, width=(len(sent) + 1) * 120)
                df = pd.DataFrame(index=[],
                                  columns=["i(index)", "orth(テキスト)", "lemma(基本形)", "reading_form(読みカナ)",
                                           "pos(PartOfSpeech)", "pos(品詞)", "tag(品詞詳細)", "inflection(活用情報)",
                                           "ent_type(エンティティ型)",
                                           "ent_iob(エンティティIOB)", "lang(言語)", "dep(dependency)", "dep(構文従属関係)",
                                           "head.i(親index)", "bunsetu_bi_label",
                                           "bunsetu_position_type", "is_bunsetu_head", "ent_label_ontonotes",
                                           "ent_label_ene"])
                for token in sent:
                    row = pd.DataFrame([token.i, token.orth_, token.lemma_, ginza.reading_form(token), token.pos_,
                                        DICT_POS_JP.get(token.pos_, token.pos_), token.tag_,
                                        ginza.inflection(token) or "-", token.ent_type_ or "-", token.ent_iob_ or "-",
                                        token.lang_, token.dep_,
                                        DICT_DEP_JP.get(token.dep_, token.dep_),
                                        token.head.i,
                                        ginza.bunsetu_bi_label(token), ginza.bunsetu_position_type(token),
                                        ginza.is_bunsetu_head(token),
                                        ginza.ent_label_ontonotes(token) or "-", ginza.ent_label_ene(token) or "-",
                                        ], index=df.columns).T
                    # df.append was removed in pandas 2.0; pd.concat is the equivalent
                    df = pd.concat([df, row], ignore_index=True)
                st.table(df.T)
                st.subheader("文節区切り")
                bunsetu_list = ginza.bunsetu_spans(sent)
                st.text("/".join([bunsetu.orth_ for bunsetu in bunsetu_list]))
                st.subheader("文節の主辞区間と句の区分")
                st.text("/".join([f"{phrase}({phrase.label_})" for phrase in ginza.bunsetu_phrase_spans(sent)]))
                st.subheader("固有表現(エンティティ)")
                if sent.ents:
                    svg_ent = spacy.displacy.render(sent, style="ent")
                    stc.html(svg_ent)
                else:
                    st.text("No Entity")
                toc.generate()
    toc.generate()
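
Code example #2 passes the output of a project-local create_manual helper to displacy's manual renderer. A minimal sketch of such a helper, assuming displacy's standard manual format (a dict of words and arcs); the project's real implementation may build this differently:

def create_manual(sent):
    # Build displacy's "manual" dict from a spaCy Span (one sentence).
    offset = sent.start
    words = [{"text": token.orth_, "tag": token.pos_} for token in sent]
    arcs = []
    for token in sent:
        if token.head.i == token.i:
            continue  # sentence root: no arc to draw
        start, end = sorted((token.i, token.head.i))
        arcs.append({"start": start - offset, "end": end - offset,
                     "label": token.dep_,
                     "dir": "left" if token.i < token.head.i else "right"})
    return {"words": words, "arcs": arcs}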
Code example #3
            # Excerpt from a larger script: tf, corpus_count, mrph, s, test_data,
            # summary_count and nlp are defined outside this fragment; bunsetu and
            # bunsetu_spans are imported from ginza.
            idf = 1 / corpus_count.get(mrph.midasi, 10)
            tfidf += (tf * idf)
    candidates.append([s, tfidf])

candidates = sorted(candidates, key=lambda x: x[1])
cand_sentences = list(map(lambda x: x[0], candidates))
cand_sentences = cand_sentences[-summary_count:]
cand_sentences.append(
    title_similar_sentence(test_data["title"], test_data["body"]))

summary_list = []

for s in cand_sentences[-summary_count:]:
    doc = nlp(s)
    summary = ""
    for sent in doc.sents:
        for t in bunsetu_spans(sent):
            for b in bunsetu(t.root, join_func=lambda tokens: tokens):
                if b.dep_ in [
                        "nsubj", "obj", "ROOT", "acl", "nmod", "compound",
                        "nummod"
                ]:
                    summary += b.lemma_
    summary_list.append(summary)

for i, s in enumerate(summary_list):
    if i < summary_count - 1:
        print(str(i) + ". " + s)
    else:
        print("タイトルに最も一致する一文 : " + s)
Code example #4
File: 41.py  Project: ktrw1011/nlp-100knock-2020
with open('./ai.ja.txt') as f:
    for line in f:
        line = line.strip()
        if line == "":
            continue

        chunks = []

        # Parse the line
        doc = nlp(line)

        # Get each bunsetu's head number (= the token index from morphological analysis)
        bunsetu_head_list = ginza.bunsetu_head_list(doc)

        # Split the doc into bunsetu (phrase units)
        for i, bunsetu in enumerate(ginza.bunsetu_spans(doc)):
            # Instantiate a Chunk object representing this bunsetu
            chunk = Chunk()
            # Collect the morphemes of the bunsetu
            morphs = get_morphs(bunsetu)
            # Store them on the chunk and append it to the list
            chunk.morphs = morphs
            chunks.append(chunk)

            # Dependency relations between bunsetu
            for token in bunsetu.lefts:
                # Get the index of the bunsetu that this dependent token belongs to
                chunk_idx = get_bunsetu_head_list_index(
                    bunsetu_head_list, token.i)
                chunks[chunk_idx].dst = i
Code example #5
def _ginza_bunsetu(self, sentence):
    return [(chunk.text, chunk.label_)
            for chunk in ginza.bunsetu_spans(self.nlp(sentence))]
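
A minimal usage sketch for the method above; the class name is hypothetical and only assumes self.nlp holds a loaded ja_ginza pipeline, as the method itself implies:

import spacy
import ginza


class BunsetuExtractor:
    def __init__(self):
        self.nlp = spacy.load("ja_ginza")

    def _ginza_bunsetu(self, sentence):
        return [(chunk.text, chunk.label_)
                for chunk in ginza.bunsetu_spans(self.nlp(sentence))]


print(BunsetuExtractor()._ginza_bunsetu("吾輩は猫である。"))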
Code example #6
# When first running this I hit several errors and had to update pip and install python3-devel via yum (CentOS 7).
# Most of this code comes from the sample at: https://www.megagon.ai/jp/blog/ginza-version-4-0/
import spacy
import ginza
nlp = spacy.load("ja_ginza")
doc = nlp("東京オリンピックは2021年に開催されています。")
print(ginza.bunsetu_spans(doc))
print("===============")
for np in doc.noun_chunks:
    print(np)
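
As a follow-up to the snippet above, a short sketch that also prints the head phrase of each bunsetu, reusing the doc defined in code example #6 and the bunsetu_phrase_spans API already used in code example #2:

for phrase in ginza.bunsetu_phrase_spans(doc):
    # Head phrase of each bunsetu and its phrase label
    print(phrase.text, phrase.label_)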