Python Aozora.read Examples

Programming Language: Python

Namespace/Package Name: aozora

Class/Type: Aozora

Method/Function: read

Examples at hotexamples.com: 8

Python Aozora.read - 8 examples found. These are the top-rated real-world Python examples of aozora.Aozora.read, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

read(8)

Aozora(7)

Frequently Used Methods

read (8)

Aozora (7)

Example #1

Show file

# -*- coding: utf-8 -*-
# Listing 3-3: program that splits "Wagahai wa Neko de Aru" (I Am a Cat) into
# words and draws a histogram / box plot of the word-count distribution.
# NOTE(review): the code visible here measures sentence length in characters
# (len() of each sentence string), not in words -- confirm against the title.
from aozora import Aozora
import re
import MeCab
import numpy as np
import matplotlib.pyplot as plt
aozora = Aozora("wagahaiwa_nekodearu.txt")

# Split the text into sentences
string = '\n'.join(aozora.read())
string = re.sub('　', '', string)  # strip full-width spaces (one pass is enough)
# Break at a newline, or at "。" unless it is followed by "」";
# empty pieces (blank lines) are dropped in the same pass.
string = [s for s in re.split('。(?!」)|\n', string) if s]
m = MeCab.Tagger("-Ochasen")  # MeCab tagger for part-of-speech analysis

# Character lengths of the 20 sentences at indices 3..22
lengthlist = np.array([len(v) for v in string][3:23])
print('average', lengthlist.mean())
print('variance', lengthlist.var())
print('std-deviation', lengthlist.std())
for u in lengthlist:
    print(u)  # each sentence length, in order of appearance
for u in sorted(lengthlist):
    print(u)  # each sentence length, in ascending order of length

plt.rcParams['font.family'] = 'IPAGothic'
fig = plt.figure()
plt.title('文の長さ（文字数）')
plt.xlabel('長さ')

Example #2

Show file

# -*- coding: utf-8 -*-
# Listing 5-9: example KWIC search program using "吾輩" as the keyword
# For NLTK concordance details see http://www.nltk.org/api/nltk.html
from aozora import Aozora
import MeCab
import nltk

KEYWORD = '吾輩'  # search term

book = Aozora("wagahaiwa_nekodearu.txt")
tagger = MeCab.Tagger("-Owakati -b65535")  # MeCab instance (wakati-gaki output)

# Join the paragraphs and convert them to wakati-gaki (word-separated) text
joined = '\n'.join(book.read())
wakati = tagger.parse(joined)

# Tokenize with NLTK and convert the tokens to the Text format
tokens = nltk.word_tokenize(wakati)
text = nltk.Text(tokens)

# Create a ConcordanceIndex instance over the input text
index = nltk.text.ConcordanceIndex(text)
index.print_concordance(KEYWORD, width=40)  # show the keyword in KWIC format
print(index.offsets(KEYWORD))  # positions of the keyword

Example #3

Show file

File: list5-9.py Project: Kose-i/learning_NLP

from aozora import Aozora
import MeCab
import nltk

# Search term looked up in the concordance
keyword = '吾輩'

# Read the text and run it through a MeCab tagger built with "-Owakati -b65535"
reader = Aozora("neko_jyo.txt")
wakati_tagger = MeCab.Tagger("-Owakati -b65535")
segmented = wakati_tagger.parse('\n'.join(reader.read()))

# Tokenize with NLTK and convert the tokens to the Text format
corpus = nltk.Text(nltk.word_tokenize(segmented))

# Create a ConcordanceIndex instance over the input text
concordance = nltk.text.ConcordanceIndex(corpus)
concordance.print_concordance(keyword, width=40)  # show the keyword in KWIC format
print(concordance.offsets(keyword))  # positions of the keyword

Example #4

Show file

from aozora import Aozora
import MeCab
import nltk


def print_data(label, data, *, can_print=True):
    """Print a labelled value: a blank line, ``*** `` plus *label*, then *data*.

    Args:
        label: Heading printed after the ``*** `` marker.
        data: Object printed on the line after the heading.
        can_print: Keyword-only switch. ``True`` (the default) prints, which
            matches the previously hard-coded flag; ``False`` makes the call
            a no-op so debug output can be silenced by the caller.

    Returns:
        None.
    """
    if not can_print:
        return
    print('\n*** ', label)
    print(data)


# Keyword whose occurrences are listed in KWIC form
keyword = '吾輩'

# Join the text into one string and run it through the MeCab tagger
source = Aozora('wagahaiwa_nekodearu.txt')
tagger = MeCab.Tagger('-Owakati -b65535')
joined = '\n'.join(source.read())
parsed = tagger.parse(joined)

# Tokenize with NLTK and index the resulting Text for concordance lookups
tokens = nltk.word_tokenize(parsed)
index = nltk.text.ConcordanceIndex(nltk.Text(tokens))

index.print_concordance(keyword, width=40)
print_data('KWICの位置情報', index.offsets(keyword))

Example #5

Show file

# -*- coding: utf-8 -*-
# Listing 4-2: example program that splits a whole document into words and
# counts how often each one occurs
from collections import Counter
from aozora import Aozora
import MeCab

aozora = Aozora("wagahaiwa_nekodearu.txt")
string = '\n'.join(aozora.read())  # join everything into a single string

# Run morphological analysis and count word frequencies
m = MeCab.Tagger("-Ochasen")  # MeCab splits the text into words
wlist = m.parse(string).splitlines()

# One list of whitespace-separated fields per output line. Lines that yield
# no fields are skipped so that taking field 0 below cannot raise IndexError.
mecablist = [fields for fields in (u.split() for u in wlist) if fields]

# Keep only the first field of each line (the word itself)
wordbodylist = [fields[0] for fields in mecablist]

# Count occurrences and print the 100 most frequent (word, count) pairs
cnt = Counter(wordbodylist)
print(cnt.most_common(100))

Example #6

Show file

# -*- coding: utf-8 -*-
# Section 4.1.1, character frequency: load an Aozora Bunko text and count how
# many times each character occurs
from collections import Counter
from aozora import Aozora

TOP_N = 50  # number of entries to print

book = Aozora("wagahaiwa_nekodearu.txt")

# Join all paragraphs into one string and count every character in it
paragraphs = book.read()
char_counts = Counter('\n'.join(paragraphs))

# Sort by count, highest first, and print the leading entries
ranked = sorted(char_counts.items(), key=lambda pair: pair[1], reverse=True)
print(ranked[:TOP_N])

Example #7

Show file

from aozora import Aozora

# Path of the dictionary file that is passed to read_pndic() below
pndicfname = './sentiment/pn_ja.dic'
# Aozora object for the text file; its read() output is joined into one string below
aozora = Aozora('./wagahaiwa_nekodearu.txt')


def read_pndic(filename, encoding=None):
    """Load a colon-separated dictionary file into a ``{key: score}`` dict.

    Every non-blank line is split on ``:``; field 0 becomes the key and
    field 3 is converted to ``float`` and becomes the value. If a key occurs
    more than once, the last occurrence wins.

    Args:
        filename: Path of the dictionary file.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform-default behaviour; pass an explicit value
            when the file's encoding differs from the locale's.

    Returns:
        dict mapping the first field of each line to its fourth field as float.

    Raises:
        IndexError: A non-blank line has fewer than four fields.
        ValueError: The fourth field of a line is not a valid float.
    """
    pndic = {}
    with open(filename, 'r', encoding=encoding) as dicfile:
        for line in dicfile.read().splitlines():
            if not line.strip():
                continue  # blank lines carry no entry
            fields = line.split(':')  # split once, reuse for key and score
            pndic[fields[0]] = float(fields[3])
    return pndic


pndic = read_pndic(pndicfname)

# Split the text into sentences
text = '\n'.join(aozora.read())
# NOTE(review): this pattern is copied as-is from the original line; confirm
# whether a full-width space was intended here.
text = re.sub(' ', '', text)
# Break at a newline, or at "。" unless it is followed by "」";
# empty pieces (blank lines) are dropped in the same pass.
sentences = [s for s in re.split('。(?!」)|\n', text) if s]
print('\n***** Sentences')
pprint(sentences)

# MeCab tagger used for part-of-speech analysis
m = MeCab.Tagger('-Ochasen')

# 文単位で形態素解析し、名詞だけ抽出し、基本形を文ごとにリストにする
sentence_word_list = [[
    v.split()[2] for v in m.parse(sentence).splitlines()
    if (len(v.split()) >= 3 and v.split()[3][:2] in ['名詞', '形容', '動詞', '副詞'])

Example #8

Show file

print(string_e)
print(cnt)
print('Count of i: ' + str(cnt['i']) + '\n')

# Japanese
string_j = '吾輩は猫である。名前はまだ無い。'
cnt = Counter(string_j)
print(string_j)
print(cnt)
print('\n')

# "Wagahai wa Neko de Aru" (I Am a Cat) by Natsume Soseki, from Aozora Bunko
# https://www.aozora.gr.jp/cards/000148/card789.html#download
from aozora import Aozora
ao = Aozora('./wagahaiwa_nekodearu.txt')
string_waga = '\n'.join(ao.read())  # join all paragraphs into a single string
cnt = Counter(string_waga)
# Sort by frequency (the count value), highest first, and print the first 50
print(sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:50])
print('\n')

# Splitting English text into sentences
# The tokenizer must be downloaded first with: python -m nltk.downloader punkt
# The inaugural corpus must be downloaded first with: python -m nltk.downloader inaugural
#  (python -m nltk.downloader all downloads every corpus)
import nltk
from nltk.corpus import inaugural
text = inaugural.raw('1789-Washington.txt')
sents = nltk.tokenize.sent_tokenize(text)
for u in sents:
    print('>>   ' + u + '<<')