Example #1
def vectorize(labels, titles):
    # Encode the category labels, reusing the label set fitted in an earlier knock.
    encoder = LabelEncoder()
    encoder.fit(load("chap06-encoder-classes"))
    labels_ = encoder.transform(labels)

    # TF-IDF vectorize the titles, reusing the persisted vocabulary.
    vectorizer = TfidfVectorizer(vocabulary=load("chap06-vectorizer-vocabs"))
    vectorizer.fit(titles)
    features_ = vectorizer.transform(titles)

    return features_, labels_
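A minimal usage sketch (the labels and titles below are made up; `load` and the pickled "chap06-…" artifacts come from the surrounding project):

labels = ["b", "e"]  # hypothetical category labels
titles = [
    "Stocks rally as markets reopen",  # hypothetical headlines
    "New film tops the box office",
]
features, encoded_labels = vectorize(labels, titles)
print(features.shape, encoded_labels)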
Example #2
"""
55. Creating a confusion matrix
Create the confusion matrix of the logistic regression model trained in task 52,
on both the training data and the evaluation data.

[MEMO]
Corresponds to knock77 in the 2015 edition
"""
import os
import sys

from sklearn.metrics import confusion_matrix

from knock53 import load_dataset

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    classifier = load("chap06-classifier")
    with Renderer("knock55") as out:
        for name in "train", "test":
            features, labels = load_dataset(f"./{name}.feature.txt")
            predicts = classifier.predict(features)
            out.result(name, confusion_matrix(labels, predicts))
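For reference, a minimal illustration of how scikit-learn lays out the matrix (rows are true labels, columns are predictions, so the diagonal counts correct classifications):

from sklearn.metrics import confusion_matrix
print(confusion_matrix([0, 1, 1, 2], [0, 1, 2, 2]))
# [[1 0 0]
#  [0 1 1]
#  [0 0 1]]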
Example #3
    bs = trial.suggest_categorical("batch_size", [32, -1])
    model = run_train(train,
                      valid,
                      model,
                      epochs=11,
                      lr=lr,
                      batch_size=bs,
                      device=device)

    loss_eval, acc_eval = run_eval(model, valid, device=device)
    return acc_eval


if __name__ == "__main__":
    # Randomly initialize the (d_w x V) embedding matrix, then overwrite the
    # columns of words that have pretrained vectors.
    emb = torch.Tensor(d_w, V).normal_()
    wv = load("chap07-embeddings")
    for i, word in enumerate(_list_valid_words()):
        if word in wv:
            wv_word = wv[word]
            wv_word.flags["WRITEABLE"] = True  # torch.from_numpy needs a writable array
            emb[:, i] = torch.from_numpy(wv_word)

    train = torch.load("./data/train.pt")
    valid = torch.load("./data/valid.pt")
    test = torch.load("./data/test.pt")

    # objective() returns validation accuracy, so maximize it
    # (optuna.create_study() minimizes by default).
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    print("best_value:", study.best_value)
    print("best_params:", study.best_params)
Example #4
[Usage]
python knock21.py
"""
import os
import re
import sys
from typing import Iterator

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


def exec_fullmatch(wiki: str, pattern: str) -> Iterator[str]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        if reg.fullmatch(line):
            yield line


if __name__ == "__main__":
    wiki = load("UK")

    for category in exec_fullmatch(wiki, r"\[\[Category:.+\]\]"):
        print(category)

    with Renderer("MEMO") as out:
        lines = tuple(line for line in wiki.split("\n") if "Category" in line)
        out.result("Category を含む行", lines)
Example #5
Visualize the word vectors for country names in the vector space with t-SNE.

[Ref]
- TSNE
    - https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

[MEMO]
Corresponds to knock99 in the 2015 edition
"""
import os
import sys
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    embeddings, country_names = load("chap07-embeddings-country")
    kmeans = load("chap07-kmeans")

    t_sne = TSNE(n_components=2).fit_transform(embeddings)

    plt.figure(figsize=(10, 5))
    cmap = plt.get_cmap("Set1")
    for name, coord, class_ in zip(country_names, t_sne, kmeans.labels_):
        cval = cmap(class_)
        plt.scatter(*coord, marker=".", color=cval, s=3)
        plt.annotate(name, xy=coord, color=cval, size=8)
    plt.savefig("out69.png")
Example #6
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.message import green  # noqa: E402 isort:skip


def remove_em(od: OrderedDict) -> OrderedDict:
    """remove emphasis expressions
        ''italics''
        '''bold'''
        '''''both'''''
    """
    res = OrderedDict()
    reg = re.compile(r"'{2,}")
    for key in od:
        res[key] = reg.sub("", od[key])
    return res


if __name__ == "__main__":
    infobox = load("infobox")
    res = remove_em(infobox)

    with Renderer("knock26") as out:
        for (key, src), (_, dst) in zip(infobox.items(), res.items()):
            if src == dst:
                out.cnt += 1
            else:
                out.result(key, (src, green(dst)))
        if infobox == res:
            message("変化なし", type="warning")
Example #7
[MEMO]
Corresponds to knock73 in the 2015 edition
"""
import os
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

classes = load("chap06-encoder-classes")
categories = {
    "b": "business",
    "t": "science and technology",
    "e": "entertainment",
    "m": "health",
}


def vectorize(labels, titles):

    encoder = LabelEncoder()
    encoder.fit(load("chap06-encoder-classes"))
    labels_ = encoder.transform(labels)

    vectorizer = TfidfVectorizer(vocabulary=load("chap06-vectorizer-vocabs"))
Example #8
Apply hierarchical clustering with Ward's method to the word vectors for country names.
Then visualize the clustering result as a dendrogram.

[Ref]
- linkage
    - https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
- dendrogram
    - https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html

[MEMO]
Corresponds to knock98 in the 2015 edition
"""
import os
import sys
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    embeddings, country_names = load("chap07-embeddings-country")

    # Hierarchical clustering with Ward's method
    z = linkage(embeddings, method="ward")

    # Visualize the clustering result as a dendrogram
    plt.figure(figsize=(20, 10))
    dendrogram(z, labels=country_names, leaf_font_size=10)
    plt.savefig("out68.png")
Example #9
def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names
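A usage sketch, assuming the "chap07-embeddings" pickle holds gensim KeyedVectors and `list_country_names()` is defined earlier in the same module:

embeddings, country_names = country_embeddings()
matrix = np.vstack(embeddings)  # (n_countries, vector_size), ready for t-SNE or clustering
print(matrix.shape, len(country_names))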
Example #10
        if isinstance(x, dict):
            for k, v in x.items():
                flatten(v, names + [k])
        elif isinstance(x, list):
            for i, v in enumerate(x):
                flatten(v, names + [str(i)])
        else:
            res["__".join(names)] = x

    flatten(json_data)
    return res


if __name__ == "__main__":

    filename = load("infobox")["国旗画像"]

    # data = fetch_url_of_img_with_urllib(filename)
    data = fetch_url_of_img_with_requests(filename)
    # pprint.pprint(data, stream=sys.stderr)

    # url = data["query"]["pages"]["-1"]["imageinfo"][0]["url"]
    # url = flatten_json(data)["url"]
    # url = flatten__json(data)["query__pages__-1__imageinfo__0__url"]

    page: dict = next(iter(data["query"]["pages"].values()))
    image_info: dict = page["imageinfo"][0]
    url: str = image_info["url"]

    save_file_from_url(url, filename)
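An illustrative call showing the key format that the commented-out lookup above relies on (assuming the truncated definition is named flatten_json, as in that comment; the URL is made up):

nested = {"query": {"pages": {"-1": {"imageinfo": [{"url": "https://example.org/flag.png"}]}}}}
print(flatten_json(nested))
# {'query__pages__-1__imageinfo__0__url': 'https://example.org/flag.png'}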
Example #11
        with Renderer("個人メモ") as out:
            out.result(
                "type hints",
                (
                    typing.get_type_hints(list_word_freq),
                    build_word_frequency_cnter.__annotations__,
                ),
            )
            out.header("with 内で return しても大丈夫なはず")
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
        exit(0)

    with Renderer("単語の異なり数") as out:
        out.result("map", len(list_word_freq(path)))
        out.result("set", len(get_vocab(path)))

    num = 10
    with Renderer(f"数単語の頻度(上位 {num} 単語のみ)") as out:
        out.result(
            "大文字と小文字の区別をする",
            build_word_frequency_cnter(path, str).most_common(num),
        )
        trans = lambda w: w.lower()  # noqa: E731
        out.result(
            "大文字と小文字の区別をしない",
Example #12
"""
57. Inspecting feature weights
For the logistic regression model trained in task 52,
check the top 10 features with the highest weights and the top 10 with the lowest weights.

[MEMO]
Corresponds to knock75 in the 2015 edition
"""
import os
import sys


sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    classifier = load("chap06-classifier")
    names = load("chap06-vectorizer-names")
    weights = classifier.coef_.flatten()
    ranking = sorted(zip(weights, names), reverse=True)
    with Renderer("knock57") as out:
        out.header("best 10")
        for weight, name in ranking[:10]:
            message(f"{name:15}{weight:f}")
        out.header("worst 10")
        for weight, name in ranking[:-11:-1]:
            message(f"{name:15}{weight:f}")