def vectorize(labels, titles):
    encoder = LabelEncoder()
    encoder.fit(load("chap06-encoder-classes"))
    labels_ = encoder.transform(labels)
    vectorizer = TfidfVectorizer(vocabulary=load("chap06-vectorizer-vocabs"))
    vectorizer.fit(titles)
    features_ = vectorizer.transform(titles)
    return features_, labels_
""" 55. 混同行列の作成 52で学習したロジスティック回帰モデルの混同行列(confusion matrix)を, 学習データおよび評価データ上で作成せよ. [MEMO] 2015 年版の knock77 に対応 """ import os import sys from sklearn.metrics import confusion_matrix from knock53 import load_dataset sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) from kiyuna.utils.message import Renderer # noqa: E402 isort:skip from kiyuna.utils.pickle import dump, load # noqa: E402 isort:skip if __name__ == "__main__": classifier = load("chap06-classifier") with Renderer("knock55") as out: for name in "train", "test": features, labels = load_dataset(f"./{name}.feature.txt") predicts = classifier.predict(features) out.result(name, confusion_matrix(labels, predicts))
    bs = trial.suggest_categorical("batch_size", [32, -1])
    model = run_train(
        train, valid, model, epochs=11, lr=lr, batch_size=bs, device=device
    )
    loss_eval, acc_eval = run_eval(model, valid, device=device)
    return acc_eval


if __name__ == "__main__":
    emb = torch.Tensor(d_w, V).normal_()
    wv = load("chap07-embeddings")
    for i, word in enumerate(_list_valid_words()):
        if word in wv:
            wv_word = wv[word]
            wv_word.flags["WRITEABLE"] = True
            emb[:, i] = torch.from_numpy(wv_word)
    train = torch.load("./data/train.pt")
    valid = torch.load("./data/valid.pt")
    test = torch.load("./data/test.pt")
    # the objective returns accuracy, so the study must maximize it
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)
    print("best_value:", study.best_value)
    print("best_params:", study.best_params)
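    # Sketch (not in the original script): study.trials_dataframe() is part of
    # the Optuna API and returns a pandas DataFrame with one row per trial,
    # which is convenient for inspecting how accuracy varies with lr/batch_size;
    # the CSV file name is an assumption.
    study.trials_dataframe().to_csv("optuna_trials.csv", index=False)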
[Usage]
python knock21.py
"""
import os
import re
import sys
from typing import Iterator

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


def exec_fullmatch(wiki: str, pattern: str) -> Iterator[str]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        if reg.fullmatch(line):
            yield line


if __name__ == "__main__":
    wiki = load("UK")
    for category in exec_fullmatch(wiki, r"\[\[Category:.+\]\]"):
        print(category)

    with Renderer("MEMO") as out:
        lines = tuple(line for line in wiki.split("\n") if "Category" in line)
        out.result("Category を含む行", lines)
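# Sketch (not part of knock21, which prints the whole line): a capture group
# around the category name, with an optional "|sort key" part, would yield just
# the names instead of the full markup lines; re and Iterator are imported above.
def iter_category_names(wiki: str) -> Iterator[str]:
    reg = re.compile(r"\[\[Category:(.+?)(?:\|.*)?\]\]")
    for line in wiki.split("\n"):
        m = reg.fullmatch(line)
        if m:
            yield m.group(1)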
Visualize the word vectors for country names in the vector space with t-SNE.

[Ref]
- TSNE
  - https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

[MEMO]
Corresponds to knock99 of the 2015 edition
"""
import os
import sys

from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


if __name__ == "__main__":
    embeddings, country_names = load("chap07-embeddings-country")
    kmeans = load("chap07-kmeans")

    t_sne = TSNE(n_components=2).fit_transform(embeddings)

    plt.figure(figsize=(10, 5))
    cmap = plt.get_cmap("Set1")
    for name, coord, class_ in zip(country_names, t_sne, kmeans.labels_):
        cval = cmap(class_)
        plt.scatter(*coord, marker=".", color=cval, s=3)
        plt.annotate(name, xy=coord, color=cval, size=8)
    plt.savefig("out69.png")
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.message import green  # noqa: E402 isort:skip


def remove_em(od: OrderedDict) -> OrderedDict:
    """remove emphasis expressions

    ''italics''
    '''bold'''
    '''''both'''''
    """
    res = OrderedDict()
    reg = re.compile(r"'{2,}")
    for key in od:
        res[key] = reg.sub("", od[key])
    return res


if __name__ == "__main__":
    infobox = load("infobox")
    res = remove_em(infobox)
    with Renderer("knock26") as out:
        for (key, src), (_, dst) in zip(infobox.items(), res.items()):
            if src == dst:
                out.cnt += 1
            else:
                out.result(key, (src, green(dst)))
    if infobox == res:
        message("変化なし", type="warning")
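# Usage sketch (hypothetical input, not from the repo): remove_em strips the
# MediaWiki emphasis quotes ('' / ''' / ''''') but keeps the text itself.
_sample = OrderedDict(motto="'''Dieu et mon droit'''", note="''italic'' text")
assert remove_em(_sample)["motto"] == "Dieu et mon droit"
assert remove_em(_sample)["note"] == "italic text"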
[MEMO]
Corresponds to knock73 of the 2015 edition
"""
import os
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

classes = load("chap06-encoder-classes")
categories = {
    "b": "business",
    "t": "science and technology",
    "e": "entertainment",
    "m": "health",
}


def vectorize(labels, titles):
    encoder = LabelEncoder()
    encoder.fit(load("chap06-encoder-classes"))
    labels_ = encoder.transform(labels)
    vectorizer = TfidfVectorizer(vocabulary=load("chap06-vectorizer-vocabs"))
    vectorizer.fit(titles)
    features_ = vectorizer.transform(titles)
    return features_, labels_
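# Sketch (assumed wiring, not shown in this fragment): LogisticRegression is
# imported above, so training and pickling the knock52-style model would
# presumably look like this; the max_iter value is an assumption.
def train_classifier(labels, titles):
    features, y = vectorize(labels, titles)
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features, y)
    dump(classifier, "chap06-classifier")
    return classifier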
Apply hierarchical clustering with Ward's method to the word vectors for country names.
In addition, visualize the clustering result as a dendrogram.

[Ref]
- linkage
  - https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
- dendrogram
  - https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html

[MEMO]
Corresponds to knock98 of the 2015 edition
"""
import os
import sys

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


if __name__ == "__main__":
    embeddings, country_names = load("chap07-embeddings-country")

    # hierarchical clustering with Ward's method
    z = linkage(embeddings, method="ward")

    # visualize the clustering result as a dendrogram
    plt.figure(figsize=(20, 10))
    dendrogram(z, labels=country_names, leaf_font_size=10)
    plt.savefig("out68.png")
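    # Sketch (not part of the original knock68): scipy.cluster.hierarchy.fcluster
    # can cut the Ward linkage matrix into flat cluster labels; 5 clusters is an
    # assumption, matching the k-means setting used elsewhere in this chapter.
    from scipy.cluster.hierarchy import fcluster

    flat_labels = fcluster(z, t=5, criterion="maxclust")
    for country, label in zip(country_names, flat_labels):
        print(label, country)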
def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names
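# Sketch (assumed wiring, not shown in this fragment): the "chap07-kmeans"
# pickle used by the later scripts presumably comes from a k-means fit over
# these country embeddings; k=5 follows the knock67 setting but is an
# assumption here.
def fit_country_kmeans(n_clusters=5):
    from sklearn.cluster import KMeans

    embeddings, country_names = country_embeddings()
    kmeans = KMeans(n_clusters=n_clusters).fit(embeddings)
    dump(kmeans, "chap07-kmeans")
    return kmeans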
        if isinstance(x, dict):
            for k, v in x.items():
                flatten(v, names + [k])
        elif isinstance(x, list):
            for i, v in enumerate(x):
                flatten(v, names + [str(i)])
        else:
            res["__".join(names)] = x

    flatten(json_data)
    return res


if __name__ == "__main__":
    filename = load("infobox")["国旗画像"]
    # data = fetch_url_of_img_with_urllib(filename)
    data = fetch_url_of_img_with_requests(filename)
    # pprint.pprint(data, stream=sys.stderr)

    # url = data["query"]["pages"]["-1"]["imageinfo"][0]["url"]
    # url = flatten_json(data)["url"]
    # url = flatten__json(data)["query__pages__-1__imageinfo__0__url"]
    page: dict = next(iter(data["query"]["pages"].values()))
    image_info: dict = page["imageinfo"][0]
    url: str = image_info["url"]
    save_file_from_url(url, filename)
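# Sketch (assumption; the real helper is defined earlier in the file and not
# shown in this fragment): a typical fetch_url_of_img_with_requests would query
# the MediaWiki imageinfo API for the file's URL. The parameter names follow the
# public MediaWiki API; the endpoint used by the repo is an assumption.
import requests


def fetch_url_of_img_with_requests_sketch(filename: str) -> dict:
    params = {
        "action": "query",
        "format": "json",
        "prop": "imageinfo",
        "iiprop": "url",
        "titles": f"File:{filename}",
    }
    endpoint = "https://commons.wikimedia.org/w/api.php"
    return requests.get(endpoint, params=params).json()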
with Renderer("個人メモ") as out: out.result( "type hints", ( typing.get_type_hints(list_word_freq), build_word_frequency_cnter.__annotations__, ), ) out.header("with 内で return しても大丈夫なはず") dis.dis(build_word_frequency_cnter, file=sys.stderr) out.header("doctest") doctest.testmod(verbose=True) out.header("check serialize") cnter = list_word_freq("../../test/00-input.txt") dump(cnter, "cnter") cnter = load("cnter") exit(0) with Renderer("単語の異なり数") as out: out.result("map", len(list_word_freq(path))) out.result("set", len(get_vocab(path))) num = 10 with Renderer(f"数単語の頻度(上位 {num} 単語のみ)") as out: out.result( "大文字と小文字の区別をする", build_word_frequency_cnter(path, str).most_common(num), ) trans = lambda w: w.lower() # noqa: E731 out.result( "大文字と小文字の区別をしない",
""" 57. 特徴量の重みの確認 52で学習したロジスティック回帰モデルの中で, 重みの高い特徴量トップ10と,重みの低い特徴量トップ10を確認せよ. [MEMO] 2015 年版の knock75 に対応 """ import os import sys from sklearn.metrics import precision_recall_fscore_support sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) from kiyuna.utils.message import message, Renderer # noqa: E402 isort:skip from kiyuna.utils.pickle import dump, load # noqa: E402 isort:skip if __name__ == "__main__": classifier = load("chap06-classifier") names = load("chap06-vectorizer-names") weights = classifier.coef_.flatten() ranking = sorted(zip(weights, names), reverse=True) with Renderer("knock57") as out: out.header("best 10") for weight, name in ranking[:10]: message(f"{name:15}{weight:f}") out.header("worst 10") for weight, name in ranking[:-11:-1]: message(f"{name:15}{weight:f}")