def write_file(filename, rng):
    category_cnter = defaultdict(int)
    with open(filename + ".txt", "w") as f:
        for i in rng:
            print(data[i]["CATEGORY"], data[i]["TITLE"], sep="\t", file=f)
            category_cnter[data[i]["CATEGORY"]] += 1
    with Renderer(filename) as out:
        for tag, name in categories.items():
            out.result(name, category_cnter[tag])
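# NOTE: write_file() relies on the module-level `data` and `categories`
# objects defined elsewhere in the script. A minimal sketch of the assumed
# shapes (hypothetical values; the b/t/e/m labels follow the News Aggregator
# dataset convention):
#
#   data = [{"CATEGORY": "b", "TITLE": "Some headline"}, ...]
#   categories = {"b": "business", "t": "science and technology",
#                 "e": "entertainment", "m": "health"}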
def build_cnter(query: dict, *, verbose=False) -> Counter[str]:
    [(tgt_key, tgt_val)] = query.items()
    cnter = Counter()
    for sentence in tqdm(mecab_into_sentences()):
        cnter += Counter(d[tgt_key] for d in sentence)
    if verbose:
        with Renderer(f"「{tgt_val}」の出現頻度") as out:
            out.header("上位 10 個")
            pprint.pprint(cnter.most_common(10), stream=sys.stderr)
            out.result("種類", len(cnter))
    return cnter
def test_extract(query: dict, *, verbose=False) -> list:
    [(src_key, src_val)] = query["src"].items()
    [(dst_key, dst_val)] = query["dst"].items()
    res = []
    for sentence in mecab_into_sentences():
        res.extend([d[dst_key] for d in sentence if d[src_key] == src_val])
    if verbose:
        with Renderer(f"「{src_val}」の「{dst_val}」") as out:
            out.result("数", len(res))
            out.result("種類", len(set(res)))
            out.result("上から 10 個", res[:10])
    return res
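# Both build_cnter() and test_extract() iterate over mecab_into_sentences(),
# the generator from knock30 that yields each sentence as a list of morpheme
# dicts. A minimal sketch of the assumed shape (the actual implementation
# lives in knock30.py; keys follow the chapter 4 convention):
#
#   def mecab_into_sentences() -> Iterator[List[Dict[str, str]]]:
#       # yields e.g. [{"surface": "...", "base": "...", "pos": "...", "pos1": "..."}, ...]
#       ...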
[Usage]
python knock03.py
"""
import doctest
import os
import re
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    doctest.testmod(verbose=True)
    s = (
        "Now I need a drink, alcoholic of course, "
        "after the heavy lectures involving quantum mechanics."
    )
    with Renderer("knock03") as out:
        out.result(
            "replace + list comprehension",
            [len(w) for w in s.replace(",", "").replace(".", "").split()],
        )
        out.result(
            "rstrip + list comprehension",
            [len(w.rstrip(",.")) for w in s.split()],
        )
        out.result("re.findall + map", list(map(len, re.findall(r"\w+", s))))
        out.result("re.sub + map", [*map(len, re.sub(r"[,.]", "", s).split())])
        out.result("re.split + map", [*map(len, re.split(r"\W+", s)[:-1])])
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    w1 = "paraparaparadise"
    w2 = "paragraph"
    n = 2
    X = set(n_gram(w1, n))
    Y = set(n_gram(w2, n))
    tgt = n_gram("se", n).pop()
    print("X =", X)
    print("Y =", Y)
    print("X ∪ Y = {}".format(X | Y))  # X.union(Y)
    print("X ∩ Y = {}".format(X & Y))  # X.intersection(Y)
    print("X \\ Y = {}".format(X - Y))  # X.difference(Y)
    print("Y \\ X = {}".format(Y - X))  # Y.difference(X)
    print(f"X includes 'se': {tgt in X}")
    print(f"Y includes 'se': {tgt in Y}")
    with Renderer("MEMO") as out:
        out.result("X ∪ Y", X.union(Y))
        out.result("X ∩ Y", X.intersection(Y))
        out.result(r"X \ Y", X.difference(n_gram(w2, n)))
        out.result(r"Y \ X", Y.difference(n_gram(w1, n)))
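# The snippet above assumes an n_gram() helper defined earlier in the same
# script. A minimal sketch compatible with the calls above -- set(),
# list.pop(), and set.difference() all accept its list output:
#
#   def n_gram(seq, n):
#       return [seq[i:i + n] for i in range(len(seq) - n + 1)]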
from typing import Iterator, Match, Tuple

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


def exec_search(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.search(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")
    pat_category_only = r"\[\[Category:(?P<Category_name>.+?)(\||])"
    for _, match in exec_search(wiki, pat_category_only):
        print(match.group("Category_name"))
    pats = (
        pat_category_only,
        r"\[\[Category:(?P<Category_name>[^|]+)\|*(?P<Sortkey>.*)\]\]",
    )
    for pat in pats:
        with Renderer(pat) as out:
            for line, match in exec_search(wiki, pat):
                out.result(line, match.groups())
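# Quick illustration of how the two patterns differ on a typical category line
# (hypothetical input; the real lines come from the pickled "UK" article):
#
#   line = "[[Category:Island countries|*]]"
#   re.search(pat_category_only, line).group("Category_name")  # -> "Island countries"
#   re.search(pats[1], line).groups()                          # -> ("Island countries", "*")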
def exec_findall(wiki: str, pattern: str) -> Iterator[Tuple[str, Group]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        for match in reg.findall(line):
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")
    pat = (
        r"(?i)"  # global inline flag; must come first (Python 3.11+ rejects it mid-pattern)
        r"(?:\s=\s)?"  # handles "基礎情報 国" (country infobox) lines
        r"([^:=]+)"  # adding '/' to this character class would also pick up file names inside <ref>
        r"\.(png|gif|jpg|jpeg|xcf|pdf|mid|ogg|svg|djvu)"
    )
    with Renderer("knock24") as out:
        for line, filename in exec_findall(wiki, pat):
            fname = ".".join(filename)
            if "/" not in fname:  # skip file names that are part of <ref> URLs
                out.result(trunc(line), green(fname))

"""
NOTE
- Wikipedia image links
  - [[ファイル:Uk topo en.jpg|thumb|200px|イギリスの地形図]]
- 基礎情報 国 (country infobox)
  - |国旗画像 = Flag of the United Kingdom.svg
- <gallery>
  - Stonehenge2007 07 30.jpg|[[ストーンヘンジ]]
- <ref>
  - <ref>[http://warp.da.ndl.go.jp/.../country.pdf
"""
[Ref]
- reversed
  - https://docs.python.org/ja/3/library/functions.html#reversed
- str is an immutable sequence type
  - https://docs.python.org/ja/3/reference/datamodel.html

[Usage]
python knock00.py
"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    s = "stressed"
    with Renderer("knock00") as out:
        out.result("slice", s[::-1])
        out.result("reversed", "".join(reversed(s)))
    with Renderer("MEMO") as out:
        out.result("reversed の返り値は", reversed(s))
        try:
            s.reverse()
        except AttributeError as e:
            out.result("string は変更不能なシーケンス型(immutable sequence)", e)
[MEMO]
Corresponds to knock94-95 of the 2015 edition.
"""
import os
import sys
from zipfile import ZipFile

from scipy.stats import spearmanr

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    wv = load("chap07-embeddings")
    preds, labels = [], []
    with ZipFile("wordsim353.zip") as myzip:
        message(myzip.infolist())
        with myzip.open("combined.csv") as myfile:
            myfile = map(lambda x: x.decode(), myfile)
            message("[header]", next(myfile))  # Word 1,Word 2,Human (mean)
            for line in myfile:
                word1, word2, human = line.split(",")
                preds.append(wv.similarity(word1, word2))
                labels.append(float(human))  # cast: the raw CSV field is a string like "6.77\n"
    with Renderer("knock66") as out:
        out.result("Spearman corr", spearmanr(preds, labels)[0])

"""result
0.6849564489532376
"""
    return list(country_names)


def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names


if __name__ == "__main__":
    embeddings, country_names = country_embeddings()
    kmeans = KMeans(n_clusters=5).fit(embeddings)
    dump(kmeans, "chap07-kmeans")
    with Renderer("knock67", start=0) as out:
        for i in range(5):
            out.result(f"Class {i}", country_names[kmeans.labels_ == i])

"""result
[*] 0. Class 0
['Bhutan' 'Bahrain' 'Japan' 'Morocco' 'Indonesia' 'Pakistan' 'Thailand'
 'Tunisia' 'Oman' 'Egypt' 'Turkey' 'Qatar' 'Iraq' 'Laos' 'Libya' 'Lebanon'
 'Jordan' 'Afghanistan' 'Bangladesh' 'Syria' 'Nepal' 'China' 'Vietnam' 'Iran']
[*] 1. Class 1
['Samoa' 'Chile' 'Dominica' 'Australia' 'Ecuador' 'Fiji' 'Bahamas' 'Canada'
 'Jamaica' 'Nicaragua' 'Cuba' 'Peru' 'Venezuela' 'Uruguay' 'Guyana'
 'Honduras' 'Belize' 'Greenland' 'Philippines' 'Taiwan' 'Tuvalu' 'Suriname']
[*] 2. Class 2
['Ghana' 'Malawi' 'Gabon' 'Gambia' 'Namibia' 'Guinea' 'Uganda' 'Somalia'
def build_word_frequency_cnter(path: str, trans: F = str) -> Counter[T]:
    with open(path) as f:
        return collections.Counter(map(trans, f.read().split()))


def get_vocab(path: str, trans: F = str) -> Set[T]:
    with open(path) as f:
        return {trans(w) for w in f.read().split()}


if __name__ == "__main__":
    path = sys.argv[1]
    if path == "MEMO":
        with Renderer("個人メモ") as out:
            out.result(
                "type hints",
                (
                    typing.get_type_hints(list_word_freq),
                    build_word_frequency_cnter.__annotations__,
                ),
            )
            out.header("with 内で return しても大丈夫なはず")
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
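# The annotations above use `T` and `F`, and the MEMO block calls
# list_word_freq(); all three are defined earlier in the full script and are
# not shown here. A plausible sketch of the type aliases (an assumption, not
# the repository's actual definitions):
#
#   T = typing.TypeVar("T")
#   F = typing.Callable[[str], T]  # transform applied to each whitespace-separated token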
[Usage]
python knock08.py
"""
import os
import string
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip


def rot13(text: str) -> str:
    # NOTE: despite the name, this is the knock08 cipher (lowercase c -> chr(219 - ord(c))),
    # an involution that maps a<->z, b<->y, ..., not ROT13.
    return "".join(chr(219 - ord(c)) if c.islower() else c for c in text)


if __name__ == "__main__":
    d = {
        "printable": string.printable.strip(),
        "ascii_lowercase": string.ascii_lowercase,
    }
    for title, s in d.items():
        with Renderer(title) as out:
            out.result("plaintext", s)
            out.header("encode")
            out.result("ciphertext", rot13(s))
            out.header("decode")
            out.result("plaintext", rot13(rot13(s)))
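# Quick sanity check of the involution property (illustrative example, not
# part of the original script):
#
#   >>> rot13("abz")
#   'zya'
#   >>> rot13(rot13("Hello, world!"))
#   'Hello, world!'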
        right = get_S_expr(best_edge[sym_ij][1], best_edge, words)
        return f"({sym} {left} {right})"
    else:
        return f"({sym} {words[int(i)]})"


if __name__ == "__main__":
    if sys.argv[1] == "test":
        grammar_file = "../../test/08-grammar.txt"
        input_file = "../../test/08-input.txt"
    else:
        grammar_file = "../../data/wiki-en-test.grammar"
        input_file = "../../data/wiki-en-short.tok"
    s, t = 0, 1
    with Renderer(sys.argv[1]) as out:
        for i, s_expr in enumerate(cky(grammar_file, input_file, s=s, t=t)):
            message("=" * 3, "line:", s + i, "=" * 3)
            tree = Tree.fromstring(s_expr)
            out.result("S-expression", s_expr)
            out.result("nltk.tree.Tree", tree)
            out.header("nltk.tree.Tree.pretty_print")
            tree.pretty_print()
            # tree.draw()

"""result
[+] main
=== line: 0 ===
[*] 1. S-expression
(S (PP (IN Among) (NP (DT these) (NP' (, ,) (NP' (JJ supervised) (NP' (NN learning) (NNS approaches)))))) (S' (VP (VBP have) (VP (VBN been) (VP' (NP (DT the) (NP' (ADJP (RBS most) (JJ successful)) (NNS algorithms))) (PP (TO to) (NP_NN date))))) (. .)))
[*] 2. nltk.tree.Tree
(S
def exec_match(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.match(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")
    pat = r"(?P<Level>=+)\s*(?P<Heading>.+)\s*(?P=Level)"
    for _, match in exec_match(wiki, pat):
        level, heading = match.group(1, 2)
        print(
            " " * (len(level) - 2),
            "+",
            f" lv{len(level) - 1} ",
            heading,
            sep="",
        )
    with Renderer("re.match() vs. re.search()") as out:
        pat_hat = r"^" + pat
        it = zip(exec_match(wiki, pat), exec_search(wiki, pat_hat))
        for (line, match1), (_, match2) in it:
            assert match1.groups() == match2.groups(), line
        else:
            message("same")
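# Illustration of the heading pattern on a hypothetical section line (note that
# the greedy (?P<Heading>.+) keeps the trailing space before the closing "=="):
#
#   m = re.match(pat, "== 歴史 ==")
#   m.group("Level", "Heading")   # -> ('==', '歴史 ')  i.e. a level-1 heading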
""" 55. 混同行列の作成 52で学習したロジスティック回帰モデルの混同行列(confusion matrix)を, 学習データおよび評価データ上で作成せよ. [MEMO] 2015 年版の knock77 に対応 """ import os import sys from sklearn.metrics import confusion_matrix from knock53 import load_dataset sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) from kiyuna.utils.message import Renderer # noqa: E402 isort:skip from kiyuna.utils.pickle import dump, load # noqa: E402 isort:skip if __name__ == "__main__": classifier = load("chap06-classifier") with Renderer("knock55") as out: for name in "train", "test": features, labels = load_dataset(f"./{name}.feature.txt") predicts = classifier.predict(features) out.result(name, confusion_matrix(labels, predicts))
def grid_search(
    model: Model,
    path_test: str,
    *,
    rng: Optional[Tuple[float, float, float]] = None,
    rng1: Optional[Tuple[float, float, float]] = None,
    rng2: Optional[Tuple[float, float, float]] = None,
    save: Optional[str] = None,
) -> Tuple[float, float]:
    def get_param(idx: np.ndarray) -> np.ndarray:
        return (
            np.array([rng1[0], rng2[0]]) + np.array([rng1[2], rng2[2]]) * idx
        )

    if rng:
        rng1 = rng2 = rng
    assert rng1 is not None
    assert rng2 is not None
    with Renderer("grid search") as out:
        cnt1 = len(np.arange(*rng1))
        cnt2 = len(np.arange(*rng2))
        E = np.zeros((cnt2, cnt1))
        for j, λ_2 in enumerate(np.arange(*rng2)):
            message(f"{j + 1:2d} / {cnt2}", CR=True, type="status")
            for i, λ_1 in enumerate(np.arange(*rng1)):
                E[j, i] = model.test(path_test, λ_1=λ_1, λ_2=λ_2)["entropy_H"]
        message("", CR=True)
        ma_y, ma_x = np.where(E == E.max())
        mi_y, mi_x = np.where(E == E.min())
        out.result("max", (E.max(), get_param(np.hstack([ma_x, ma_y]))))
        out.result("min", (E.min(), get_param(np.hstack([mi_x, mi_y]))))
    if save:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        mappable = ax.pcolor(E, cmap="jet", edgecolors="k", alpha=0.8)
        fig.colorbar(mappable)
        ax.scatter(ma_x + 0.5, ma_y + 0.5, c="r", label="max")
        ax.scatter(mi_x + 0.5, mi_y + 0.5, c="b", label="min")
        ax.set_xticks(np.arange(cnt1) + 0.5, minor=False)
        ax.set_yticks(np.arange(cnt2) + 0.5, minor=False)
        ax.set_xticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng1)),
            minor=False,
            rotation=45,
        )
        ax.set_yticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng2)),
            minor=False,
        )
        ax.set_title(f"エントロピー {get_ext(model.WittenBell)}")
        ax.set_xlabel("$λ_1$")
        ax.set_ylabel("$λ_2$")
        ax.set_aspect("equal")
        ax.legend(loc="lower right")
        plt.savefig(save)
    return get_param(np.hstack([mi_x, mi_y]))
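# Hypothetical invocation, assuming `lm` is an already-trained interpolated
# language model exposing the test() method used above (names and paths are
# illustrative, not taken from the repository):
#
#   best_λ1, best_λ2 = grid_search(
#       lm, "../../data/wiki-en-test.word",
#       rng=(0.05, 1.0, 0.05), save="entropy_grid.png",
#   )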
https://nlp100.github.io/ja/ch04.html#33-aのb

[Usage]
python knock33.py
"""
import os
import sys
from typing import Dict, List

from knock30 import mecab_into_sentences

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip

Morpheme = Dict[str, str]
Sentence = List[Morpheme]

if __name__ == "__main__":
    tgt = "AのB"
    res = []
    for sentence in mecab_into_sentences():
        for a, no, b in zip(sentence, sentence[1:], sentence[2:]):
            if (a["pos"], no["surface"], b["pos"]) == ("名詞", "の", "名詞"):
                res.append("".join(map(lambda x: x["surface"], (a, no, b))))
    with Renderer(tgt) as out:
        out.result("数", len(res))
        out.result("種類", len(set(res)))
        out.result("上から 10 個", res[:10])
[Command]
wc (word count)
  -c  print the byte count
  -l  print the newline count
  -m  print the character count (multibyte-aware)
  -w  print the word count

[Usage]
INPUT_PATH=./popular-names.txt
python knock10.py $INPUT_PATH
wc -l $INPUT_PATH
cat $INPUT_PATH | wc -l
diff -sw <(python knock10.py $INPUT_PATH) <(cat $INPUT_PATH | wc -l)
"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    path = sys.argv[1]
    with Renderer("knock10") as out, open(path) as f:
        out.result("generator", sum(1 for _ in f))
        out.result("readlines", len(open(path).readlines()))
        out.result("read", len(open(path).read().rstrip("\n").split("\n")))
    print(sum(1 for _ in open(path)))