Code example #1
File: knock50.py  Project: tmu-nlp/100knock2020
from collections import defaultdict


def write_file(filename, rng):
    # `data`, `categories`, and `Renderer` come from the surrounding module
    category_cnter = defaultdict(int)
    with open(filename + ".txt", "w") as f:
        for i in rng:
            print(data[i]["CATEGORY"], data[i]["TITLE"], sep="\t", file=f)
            category_cnter[data[i]["CATEGORY"]] += 1
    with Renderer(filename) as out:
        for tag, name in categories.items():
            out.result(name, category_cnter[tag])
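Renderer (imported from kiyuna.utils.message) appears in every example but is never shown. A minimal stand-in sufficient to run the snippets, inferred only from how it is used here (a sketch, not the project's actual implementation):

import sys


class Renderer:
    """Hypothetical stand-in: a context manager that titles a block of
    numbered result lines, as suggested by the output in later examples."""

    def __init__(self, title, start=1):
        self.title, self.idx = title, start

    def __enter__(self):
        print(f"[+] {self.title}", file=sys.stderr)
        return self

    def __exit__(self, *exc):
        return False  # do not swallow exceptions

    def header(self, text):
        print(f"--- {text} ---", file=sys.stderr)

    def result(self, name, value):
        print(f"[*] {self.idx:2d}. {name}", value, file=sys.stderr)
        self.idx += 1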
Code example #2
import pprint
import sys
from collections import Counter

from tqdm import tqdm

# mecab_into_sentences and Renderer are provided by the surrounding project


def build_cnter(query: dict, *, verbose=False) -> Counter[str]:
    # `query` is expected to hold exactly one key/value pair
    [(tgt_key, tgt_val)] = query.items()

    cnter = Counter()
    for sentence in tqdm(mecab_into_sentences()):
        cnter += Counter(d[tgt_key] for d in sentence)

    if verbose:
        with Renderer(f"「{tgt_val}」の出現頻度") as out:
            out.header("上位 10 個")
            pprint.pprint(cnter.most_common(10), stream=sys.stderr)
            out.result("種類", len(cnter))

    return cnter
Code example #3
File: knock30.py  Project: tmu-nlp/100knock2020
def test_extract(query: dict, *, verbose=False) -> list:
    # each sub-query is expected to hold exactly one key/value pair
    [(src_key, src_val)] = query["src"].items()
    [(dst_key, dst_val)] = query["dst"].items()

    res = []
    for sentence in mecab_into_sentences():
        res.extend([d[dst_key] for d in sentence if d[src_key] == src_val])

    if verbose:
        with Renderer(f"「{src_val}」の「{dst_val}」") as out:
            out.result("数", len(res))
            out.result("種類", len(set(res)))
            out.result("上から 10 個", res[:10])

    return res
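mecab_into_sentences (imported from knock30) is not shown in these excerpts; judging by the keys accessed above, it yields each sentence as a list of morpheme dicts. A hypothetical sketch (the file name and exact fields are assumptions):

def mecab_into_sentences(path: str = "neko.txt.mecab"):
    """Hypothetical: parse MeCab output into sentences, each a list of
    dicts with the keys surface / base / pos / pos1."""
    sentence = []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if line == "EOS":
                if sentence:
                    yield sentence
                sentence = []
                continue
            surface, _, features = line.partition("\t")
            fields = features.split(",")
            sentence.append({
                "surface": surface,
                "base": fields[6] if len(fields) > 6 else surface,
                "pos": fields[0],
                "pos1": fields[1] if len(fields) > 1 else "",
            })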
Code example #4
[Usage]
python knock03.py
"""
import doctest
import os
import re
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    doctest.testmod(verbose=True)

    s = ("Now I need a drink, alcoholic of course, "
         "after the heavy lectures involving quantum mechanics.")

    with Renderer("knock03") as out:
        out.result(
            "replace + list comprehension",
            [len(w) for w in s.replace(",", "").replace(".", "").split()],
        )
        out.result(
            "rstrip + list comprehension",
            [len(w.rstrip(",.")) for w in s.split()],
        )
        out.result("re.findall + map", list(map(len, re.findall(r"\w+", s))))
        out.result("re.sub + map", [*map(len, re.sub(r"[,.]", "", s).split())])
        out.result("re.split + map", [*map(len, re.split(r"\W+", s)[:-1])])
Code example #5
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":

    w1 = "paraparaparadise"
    w2 = "paragraph"
    n = 2

    X = set(n_gram(w1, n))
    Y = set(n_gram(w2, n))
    tgt = n_gram("se", n).pop()

    print("X =", X)
    print("Y =", Y)

    print("X ∪ Y = {}".format(X | Y))  # X.union(Y)
    print("X ∩ Y = {}".format(X & Y))  # X.intersection(Y)
    print("X \\ Y = {}".format(X - Y))  # X.difference(Y)
    print("Y \\ X = {}".format(Y - X))  # Y.difference(X)

    print(f"X includes 'se': {tgt in X}")
    print(f"Y includes 'se': {tgt in Y}")

    with Renderer("MEMO") as out:
        out.result(r"X ∪ Y", X.union(Y))
        out.result(f"X ∩ Y", X.intersection(Y))
        out.result(rf"X \ Y", X.difference(n_gram(w2, n)))
        out.result(fr"Y \ X", Y.difference(n_gram(w1, n)))
Code example #6
File: knock22.py  Project: tmu-nlp/100knock2020
import os
import re
import sys
from typing import Iterator, Match, Tuple

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip


def exec_search(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.search(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat_category_only = r"\[\[Category:(?P<Category_name>.+?)(\||])"
    for _, match in exec_search(wiki, pat_category_only):
        print(match.group("Category_name"))

    pats = (
        pat_category_only,
        r"\[\[Category:(?P<Category_name>[^|]+)\|*(?P<Sortkey>.*)\]\]",
    )
    for pat in pats:
        with Renderer(pat) as out:
            for line, match in exec_search(wiki, pat):
                out.result(line, match.groups())
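A quick illustration of what the two patterns capture, using a category line of the shape found in the article (standard re behavior; the sample line is illustrative):

import re

line = "[[Category:イギリス|*]]"
m = re.search(r"\[\[Category:(?P<Category_name>.+?)(\||])", line)
print(m.group("Category_name"))  # -> イギリス
m = re.search(r"\[\[Category:(?P<Category_name>[^|]+)\|*(?P<Sortkey>.*)\]\]", line)
print(m.groups())  # -> ('イギリス', '*')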
Code example #7
File: knock24.py  Project: tmu-nlp/100knock2020
# `Group` is a type alias defined elsewhere in the file (the tuple of captured groups)
def exec_findall(wiki: str, pattern: str) -> Iterator[Tuple[str, Group]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        for match in reg.findall(line):
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat = (
        r"(?:\s=\s)?"  # guard for "基礎情報 国" (country infobox) lines
        r"([^:=]+)"  # adding '/' inside [^] would also pick up file names inside <ref>
        r"\.(?i:png|gif|jpg|jpeg|xcf|pdf|mid|ogg|svg|djvu)")  # scoped (?i:) works on Python 3.11+
    with Renderer("knock24") as out:
        for line, filename in exec_findall(wiki, pat):
            fname = ".".join(filename)
            if "/" not in fname:  # <ref> 対策
                out.result(trunc(line), green(fname))
    """ NOTE
    - ウィキペディアの画像
        - [[ファイル:Uk topo en.jpg|thumb|200px|イギリスの地形図]]
    - 基礎情報 国
        - |国旗画像 = Flag of the United Kingdom.svg
    - <gallery>
        - Stonehenge2007 07 30.jpg|[[ストーンヘンジ]]
    - <ref>
        - <ref>[http://warp.da.ndl.go.jp/.../country.pdf
    """
Code example #8
[Ref]
- reversed
    - https://docs.python.org/ja/3/library/functions.html#reversed
- string is an immutable sequence type
    - https://docs.python.org/ja/3/reference/datamodel.html

[Usage]
python knock00.py
"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":

    s = "stressed"

    with Renderer("knock00") as out:
        out.result("slice", s[::-1])
        out.result("reversed", "".join(reversed(s)))

    with Renderer("MEMO") as out:
        out.result("reversed の返り値は", reversed(s))
        try:
            s.reverse()
        except AttributeError as e:
            out.result("string は変更不能なシーケンス型(immutable sequence)", e)
Code example #9
[MEMO]
Corresponds to knock94-95 in the 2015 edition
"""
import os
import sys
from zipfile import ZipFile

from scipy.stats import spearmanr

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    wv = load("chap07-embeddings")
    preds, labels = [], []
    with ZipFile("wordsim353.zip") as myzip:
        message(myzip.infolist())
        with myzip.open("combined.csv") as myfile:
            myfile = map(lambda x: x.decode(), myfile)
            message("[header]", next(myfile))  # Word 1,Word 2,Human (mean)
            for line in myfile:
                word1, word2, human = line.split(",")
                preds.append(wv.similarity(word1, word2))
                labels.append(float(human))  # convert so spearmanr ranks numerically
    with Renderer("knock66") as out:
        out.result("Spearman corr", spearmanr(preds, labels)[0])
"""result
0.6849564489532376
"""
Code example #10
    return list(country_names)


def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names


if __name__ == "__main__":
    embeddings, country_names = country_embeddings()
    kmeans = KMeans(n_clusters=5).fit(embeddings)
    dump(kmeans, "chap07-kmeans")
    with Renderer("knock67", start=0) as out:
        for i in range(5):
            out.result(f"Class {i}", country_names[kmeans.labels_ == i])
"""result
[*]  0. Class 0
['Bhutan' 'Bahrain' 'Japan' 'Morocco' 'Indonesia' 'Pakistan' 'Thailand'
 'Tunisia' 'Oman' 'Egypt' 'Turkey' 'Qatar' 'Iraq' 'Laos' 'Libya' 'Lebanon'
 'Jordan' 'Afghanistan' 'Bangladesh' 'Syria' 'Nepal' 'China' 'Vietnam'
 'Iran']
[*]  1. Class 1
['Samoa' 'Chile' 'Dominica' 'Australia' 'Ecuador' 'Fiji' 'Bahamas'
 'Canada' 'Jamaica' 'Nicaragua' 'Cuba' 'Peru' 'Venezuela' 'Uruguay'
 'Guyana' 'Honduras' 'Belize' 'Greenland' 'Philippines' 'Taiwan' 'Tuvalu'
 'Suriname']
[*]  2. Class 2
['Ghana' 'Malawi' 'Gabon' 'Gambia' 'Namibia' 'Guinea' 'Uganda' 'Somalia'
Code example #11
def build_word_frequency_cnter(path: str, trans: F = str) -> Counter[T]:
    with open(path) as f:
        return collections.Counter(map(trans, f.read().split()))


def get_vocab(path: str, trans: F = str) -> Set[T]:
    with open(path) as f:
        return {trans(w) for w in f.read().split()}


if __name__ == "__main__":
    path = sys.argv[1]

    if path == "MEMO":
        with Renderer("個人メモ") as out:
            out.result(
                "type hints",
                (
                    typing.get_type_hints(list_word_freq),
                    build_word_frequency_cnter.__annotations__,
                ),
            )
            out.header("with 内で return しても大丈夫なはず")
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
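The T and F used in the annotations above are defined in the elided top of the file; a plausible reconstruction (an assumption, not the original code):

from typing import Callable, TypeVar

T = TypeVar("T")
F = Callable[[str], T]  # a transform applied to each whitespace-separated token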
Code example #12
File: knock08.py  Project: tmu-nlp/100knock2020
[Usage]
python knock08.py
"""
import os
import string
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip


def rot13(text: str) -> str:
    # NOTE: despite the name this is not ROT13 but the knock08 mirror cipher:
    # 219 == ord("a") + ord("z"), so each lowercase letter maps to its mirror
    # (a<->z, b<->y, ...); applying it twice restores the plaintext.
    return "".join(chr(219 - ord(c)) if c.islower() else c for c in text)


if __name__ == "__main__":

    d = {
        "printable": string.printable.strip(),
        "ascii_lowercase": string.ascii_lowercase,
    }

    for title, s in d.items():
        with Renderer(title) as out:
            out.result("plaintext", s)
            out.header("encode")
            out.result("ciphertext", rot13(s))
            out.header("decode")
            out.result("plaintext", rot13(rot13(s)))
Code example #13
        right = get_S_expr(best_edge[sym_ij][1], best_edge, words)
        return f"({sym} {left} {right})"
    else:
        return f"({sym} {words[int(i)]})"


if __name__ == "__main__":
    if sys.argv[1] == "test":
        grammar_file = "../../test/08-grammar.txt"
        input_file = "../../test/08-input.txt"
    else:
        grammar_file = "../../data/wiki-en-test.grammar"
        input_file = "../../data/wiki-en-short.tok"

    s, t = 0, 1
    with Renderer(sys.argv[1]) as out:
        for i, s_expr in enumerate(cky(grammar_file, input_file, s=s, t=t)):
            message("=" * 3, "line:", s + i, "=" * 3)
            tree = Tree.fromstring(s_expr)
            out.result("S-expression", s_expr)
            out.result("nltk.tree.Tree", tree)
            out.header("nltk.tree.Tree.pretty_print")
            tree.pretty_print()
            # tree.draw()
"""result
[+] main
=== line: 0 ===
[*]  1. S-expression
(S (PP (IN Among) (NP (DT these) (NP' (, ,) (NP' (JJ supervised) (NP' (NN learning) (NNS approaches)))))) (S' (VP (VBP have) (VP (VBN been) (VP' (NP (DT the) (NP' (ADJP (RBS most) (JJ successful)) (NNS algorithms))) (PP (TO to) (NP_NN date))))) (. .)))
[*]  2. nltk.tree.Tree
(S
Code example #14
File: knock23.py  Project: tmu-nlp/100knock2020
def exec_match(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.match(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat = r"(?P<Level>=+)\s*(?P<Heading>.+)\s*(?P=Level)"
    for _, match in exec_match(wiki, pat):
        level, heading = match.group(1, 2)
        print(
            "  " * (len(level) - 2),
            "+",
            f" lv{len(level) - 1} ",
            heading,
            sep="",
        )

    with Renderer("re.match() vs. re.search()") as out:
        pat_hat = r"^" + pat
        it = zip(exec_match(wiki, pat), exec_search(wiki, pat_hat))
        for (line, match1), (_, match2) in it:
            assert match1.groups() == match2.groups(), line
        else:  # for-else: runs when the loop completes without a break
            message("same")
Code example #15
File: knock55.py  Project: tmu-nlp/100knock2020
"""
55. Creating a confusion matrix
Create the confusion matrix of the logistic-regression model trained in knock52,
on both the training data and the evaluation data.

[MEMO]
Corresponds to knock77 in the 2015 edition
"""
import os
import sys

from sklearn.metrics import confusion_matrix

from knock53 import load_dataset

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    classifier = load("chap06-classifier")
    with Renderer("knock55") as out:
        for name in "train", "test":
            features, labels = load_dataset(f"./{name}.feature.txt")
            predicts = classifier.predict(features)
            out.result(name, confusion_matrix(labels, predicts))
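sklearn's confusion_matrix puts true labels on the rows and predictions on the columns, with classes in sorted order; a tiny example:

from sklearn.metrics import confusion_matrix

# rows = true labels, columns = predictions (classes sorted: [0, 1])
print(confusion_matrix([0, 1, 1], [0, 0, 1]))  # [[1 0]
                                               #  [1 1]]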
Code example #16
File: tutorial02.py  Project: tmu-nlp/NLPtutorial2020
def grid_search(
    model: Model,
    path_test: str,
    *,
    rng: Optional[Tuple[float, float, float]] = None,
    rng1: Optional[Tuple[float, float, float]] = None,
    rng2: Optional[Tuple[float, float, float]] = None,
    save: Optional[str] = None,
) -> Tuple[float, float]:
    def get_param(idx: np.ndarray) -> np.ndarray:
        return (np.array([rng1[0], rng2[0]]) +
                np.array([rng1[2], rng2[2]]) * idx)

    if rng:
        rng1 = rng2 = rng
    assert rng1 is not None
    assert rng2 is not None

    with Renderer("grid search") as out:
        cnt1 = len(np.arange(*rng1))
        cnt2 = len(np.arange(*rng2))
        E = np.zeros((cnt2, cnt1))
        for j, λ_2 in enumerate(np.arange(*rng2)):
            message(f"{j + 1:2d} / {cnt2}", CR=True, type="status")
            for i, λ_1 in enumerate(np.arange(*rng1)):
                E[j, i] = model.test(path_test, λ_1=λ_1, λ_2=λ_2)["entropy_H"]
        message("", CR=True)

        ma_y, ma_x = np.where(E == E.max())
        mi_y, mi_x = np.where(E == E.min())
        out.result("max", (E.max(), get_param(np.hstack([ma_x, ma_y]))))
        out.result("min", (E.min(), get_param(np.hstack([mi_x, mi_y]))))

    if save:
        fig = plt.figure()
        ax = fig.add_subplot(111)

        mappable = ax.pcolor(E, cmap="jet", edgecolors="k", alpha=0.8)
        fig.colorbar(mappable)

        ax.scatter(ma_x + 0.5, ma_y + 0.5, c="r", label="max")
        ax.scatter(mi_x + 0.5, mi_y + 0.5, c="b", label="min")

        ax.set_xticks(np.arange(cnt1) + 0.5, minor=False)
        ax.set_yticks(np.arange(cnt2) + 0.5, minor=False)
        ax.set_xticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng1)),
            minor=False,
            rotation=45,
        )
        ax.set_yticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng2)),
            minor=False,
        )
        ax.set_title(f"エントロピー {get_ext(model.WittenBell)}")
        ax.set_xlabel("$λ_1$")
        ax.set_ylabel("$λ_2$")
        ax.set_aspect("equal")
        ax.legend(loc="lower right")
        plt.savefig(save)

    return get_param(np.hstack([mi_x, mi_y]))
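A hypothetical invocation (the model, file name, and step sizes are placeholders, not values from the project):

# assumes `model` is an already-trained Model instance
λ1_best, λ2_best = grid_search(
    model,
    "test.word",
    rng=(0.05, 1.0, 0.05),  # same (start, stop, step) grid for λ_1 and λ_2
    save="entropy-grid.png",
)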
Code example #17
https://nlp100.github.io/ja/ch04.html#33-aのb

[Usage]
python knock33.py
"""
import os
import sys
from typing import Dict, List

from knock30 import mecab_into_sentences

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip

Morpheme = Dict[str, str]
Sentence = List[Morpheme]

if __name__ == "__main__":
    tgt = "AのB"

    res = []
    for sentence in mecab_into_sentences():
        for a, no, b in zip(sentence, sentence[1:], sentence[2:]):
            if (a["pos"], no["surface"], b["pos"]) == ("名詞", "の", "名詞"):
                res.append("".join(map(lambda x: x["surface"], (a, no, b))))

    with Renderer(tgt) as out:
        out.result("数", len(res))
        out.result("種類", len(set(res)))
        out.result("上から 10 個", res[:10])
Code example #18
[Command]
wc (word count)
    -c print the byte count
    -l print the newline count
    -m print the character count (multibyte-aware)
    -w print the word count

[Usage]
INPUT_PATH=./popular-names.txt
python knock10.py $INPUT_PATH
wc -l $INPUT_PATH
cat $INPUT_PATH | wc -l
diff -sw <(python knock10.py $INPUT_PATH) <(cat $INPUT_PATH | wc -l)
"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip

if __name__ == "__main__":
    path = sys.argv[1]

    with Renderer("knock10") as out, open(path) as f:
        out.result("generator", sum(1 for _ in f))
        out.result("readlines", len(open(path).readlines()))
        out.result("read", len(open(path).read().rstrip("\n").split("\n")))

    print(sum(1 for _ in open(path)))