Code example #1
File: knock29.py  Project: tmu-nlp/100knock2020
def render_html(path_img, path_html="out29.html") -> None:
    message("save :", path_html, type="status")
    contents = ("<!DOCTYPE html><html>"
                "<head><title>knock29</title></head>"
                '<body><img src="%s" width="128"/></body>'
                "</html>" % path_img)
    with open(path_html, "w") as f:
        f.write(contents)

    message("open :", path_html, type="status")
    webbrowser.open(path_html)
Code example #2
 def tag(self, path_input: str, path_output: str, **kwargs) -> None:
     self.params.update(**kwargs)
     res = []
     with open(path_input) as f_in:
         for line in tqdm.tqdm(f_in):
             words = [self.trans(word) for word in line.split()]
             best_edge = self.__forward(words)
             tags = self.__backward(words, best_edge)
             res.append(" ".join(tags) + "\n")
     with open(path_output, "w") as f_out:
         f_out.writelines(res)
     message(f"saved : {path_output}", type="success")
Code example #3
def sample(test_path, epochs=1, α=0.01, β=0.01, num_topics=2):
    """ #09 p24 """
    xcorpus, ycorpus, xcounts, ycounts, wordtype = initialize(test_path, num_topics)
    for epoch in range(1, epochs + 1):
        message("epoch =", epoch, type="status")
        ll = 0
        for i in tqdm(range(len(xcorpus)), leave=False):
            for j in range(len(xcorpus[i])):
                x = xcorpus[i][j]
                y = ycorpus[i][j]
                add_counts(xcounts, ycounts, x, y, i, -1)
                probs = []
                for k in range(num_topics):
                    p_xk = (xcounts[f"{x}|{k}"] + α) / (xcounts[k] + α * wordtype)
                    p_ky = (ycounts[f"{k}|{i}"] + β) / (ycounts[i] + β * num_topics)
                    probs.append(p_xk * p_ky)
                new_y = sampleone(probs)
                ll += math.log(probs[new_y])
                add_counts(xcounts, ycounts, x, new_y, i, 1)
                ycorpus[i][j] = new_y
        message("ll =", ll, type="success")
    return xcorpus, ycorpus
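
The helpers initialize, add_counts, and sampleone are project utilities that are not shown in this excerpt. A minimal sketch of sampleone, assuming it takes the list of unnormalized topic weights built above and returns the index of the sampled topic (which is how it is used), could look like this:

import random

def sampleone(probs):
    # Hypothetical reconstruction: draw an index with probability
    # proportional to the unnormalized weights in probs.
    r = random.random() * sum(probs)
    for i, p in enumerate(probs):
        r -= p
        if r < 0:
            return i
    return len(probs) - 1  # guard against floating-point round-off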
Code example #4
File: tutorial02.py  Project: tmu-nlp/NLPtutorial2020
def test(args: argparse.Namespace) -> None:
    model = BigramLM(WittenBell=args.WittenBell).load(args.model)

    res = model.test(args.test)
    if args.name:
        message(
            f"[{args.name} | {get_ext(args.WittenBell)}"
            f" default(λ_1={0.95:.2f}, λ_2={0.95:.2f})]",
            file=sys.stdout,
        )
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)

    # λ_1, λ_2 = grid_search(
    #     model,
    #     args.test,
    #     rng=(0.01, 1, 0.01),
    #     save=f"result_{get_ext(args.WittenBell)}.png",
    # )
    λ_1, λ_2 = grid_search(
        model,
        args.test,
        rng=(0.05, 1, 0.05),
        save=f"fig1_{get_ext(args.WittenBell)}.png",
    )
    # λ_1, λ_2 = grid_search(
    #     model,
    #     args.test,
    #     rng1=(λ_1 - 0.1, λ_1 + 0.1, 0.01),
    #     rng2=(λ_2 - 0.1, λ_2 + 0.1, 0.01),
    #     save=f"fig2_{get_ext(args.WittenBell)}.png",
    # )

    res = model.test(args.test, λ_1=λ_1, λ_2=λ_2)
    if args.name:
        message(
            f"[{args.name} | {get_ext(args.WittenBell)}"
            f" optimized(λ_1={λ_1:.2f}, λ_2={λ_2:.2f})]",
            file=sys.stdout,
        )
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)
    """result
Code example #5
File: tutorial03.py  Project: tmu-nlp/NLPtutorial2020
class Tokenizer(UnigramLM):
    def tokenize(
        self,
        path_input: str,
        path_output: str,
        *,
        λ_1: float = 0.95,
        vocab_size: int = 1_000_000,
    ) -> None:
        def forward(line: str, V: int = vocab_size) -> List[Tuple[int, int]]:
            size = len(line)
            best_edge = [None] * (size + 1)
            best_score = [float("inf")] * (size + 1)
            best_score[0] = 0
            for word_end in range(1, size + 1):
                for word_begin in range(size):
                    word = line[word_begin:word_end]
                    if word in self.model or len(word) == 1:
                        prob = λ_1 * self.model.get(word, 0) + (1 - λ_1) / V
                        my_score = best_score[word_begin] + -math.log2(prob)
                        if my_score < best_score[word_end]:
                            best_score[word_end] = my_score
                            best_edge[word_end] = (word_begin, word_end)
            return best_edge

        def backward(line: str, best_edge: List[Tuple[int, int]]) -> List[str]:
            words = []
            next_edge = best_edge[-1]
            while next_edge:
                words.append(line[next_edge[0]:next_edge[1]])
                next_edge = best_edge[next_edge[0]]
            words.reverse()
            return words

        res = []
        with open(path_input) as f_in:
            for line in map(lambda x: x.strip(), f_in):
                best_edge = forward(line)
                words = backward(line, best_edge)
                res.append(" ".join(words) + "\n")
        with open(path_output, "w") as f_out:
            f_out.writelines(res)
        message(f"saved {path_output}", type="success")
Code example #6
File: tutorial01.py  Project: tmu-nlp/NLPtutorial2020
def test(args: argparse.Namespace) -> None:
    model = UnigramLM().load(args.model)

    res = model.test(args.test)
    if args.name:
        message(f"[{args.name} | default(λ_1={0.95:.2f})]", file=sys.stdout)
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)

    entropy, λ_1 = min(
        (model.test(args.test, λ_unk=1 - λ_1)["entropy_H"], λ_1)
        for λ_1 in np.arange(0, 1, 0.01)
    )

    res = model.test(args.test, λ_unk=1 - λ_1)
    if args.name:
        message(
            f"[{args.name} | optimized(λ_1={λ_1:.2f})]", file=sys.stdout,
        )
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)

    """result
Code example #7
File: tutorial02.py  Project: tmu-nlp/NLPtutorial2020
def grid_search(
    model: Model,
    path_test: str,
    *,
    rng: Optional[Tuple[float, float, float]] = None,
    rng1: Optional[Tuple[float, float, float]] = None,
    rng2: Optional[Tuple[float, float, float]] = None,
    save: Optional[str] = None,
) -> Tuple[float, float]:
    def get_param(idx: np.ndarray) -> np.ndarray:
        return (np.array([rng1[0], rng2[0]]) +
                np.array([rng1[2], rng2[2]]) * idx)

    if rng:
        rng1 = rng2 = rng
    assert rng1 is not None
    assert rng2 is not None

    with Renderer("grid search") as out:
        cnt1 = len(np.arange(*rng1))
        cnt2 = len(np.arange(*rng2))
        E = np.zeros((cnt2, cnt1))
        for j, λ_2 in enumerate(np.arange(*rng2)):
            message(f"{j + 1:2d} / {cnt2}", CR=True, type="status")
            for i, λ_1 in enumerate(np.arange(*rng1)):
                E[j, i] = model.test(path_test, λ_1=λ_1, λ_2=λ_2)["entropy_H"]
        message("", CR=True)

        ma_y, ma_x = np.where(E == E.max())
        mi_y, mi_x = np.where(E == E.min())
        out.result("max", (E.max(), get_param(np.hstack([ma_x, ma_y]))))
        out.result("min", (E.min(), get_param(np.hstack([mi_x, mi_y]))))

    if save:
        fig = plt.figure()
        ax = fig.add_subplot(111)

        mappable = ax.pcolor(E, cmap="jet", edgecolors="k", alpha=0.8)
        fig.colorbar(mappable)

        ax.scatter(ma_x + 0.5, ma_y + 0.5, c="r", label="max")
        ax.scatter(mi_x + 0.5, mi_y + 0.5, c="b", label="min")

        ax.set_xticks(np.arange(cnt1) + 0.5, minor=False)
        ax.set_yticks(np.arange(cnt2) + 0.5, minor=False)
        ax.set_xticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng1)),
            minor=False,
            rotation=45,
        )
        ax.set_yticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng2)),
            minor=False,
        )
        ax.set_title(f"エントロピー {get_ext(model.WittenBell)}")
        ax.set_xlabel("$λ_1$")
        ax.set_ylabel("$λ_2$")
        ax.set_aspect("equal")
        ax.legend(loc="lower right")
        plt.savefig(save)

    return get_param(np.hstack([mi_x, mi_y]))
Code example #8
File: knock29.py  Project: tmu-nlp/100knock2020
def fetch_url_of_img_with_urllib(filename: str) -> dict:
    message("fetch:", f"url of `{filename}`", type="status")
    url_ = url + "?%s" % urllib.parse.urlencode(make_payload(filename))
    with urllib.request.urlopen(url_) as f:
        return json.loads(f.read().decode("utf-8"))
Code example #9
 def train(self, path_corpus: str) -> "POSTagger":
     self.probs, self.possible_tags = self.__build_model(path_corpus)
     message(f"train model from {path_corpus}", type="success")
     return self
Code example #10
 def load(self, path_model: str) -> "POSTagger":
     self.probs, self.possible_tags = self.__load_model(path_model)
     message(f"load  model from {path_model}", type="success")
     return self
Code example #11
 def __exit__(self, *args) -> None:
     message("saved :", self.name, "\n", CR=True, type="success")
Code example #12
    for k, v in cnter.items():
        idx = v.index(max(v))
        groups[idx].append((v[idx], k))
        tmp.append((v[idx], k))
    print_cnt = min(300, len(tmp))
    th = sorted(tmp, reverse=True)[print_cnt - 1][0]
    for i, group in enumerate(groups):
        print("=" * 5, i, "=" * 5)
        group.sort(reverse=True)
        res = [word for freq, word in group if word not in stop_words and freq >= th]
        pprint(res, width=80, compact=True)


if __name__ == "__main__":
    if sys.argv[1:] == ["test"]:
        message("test", type="status")
        learn_lda(test_path="../../test/07-train.txt", epochs=50)
    else:
        message("main", type="status")
        stop_words = nltk.corpus.stopwords.words("english")
        symbols = [
            "'",
            '"',
            ":",
            ";",
            ".",
            ",",
            "-",
            "!",
            "?",
            ")",
Code example #13
File: knock43.py  Project: tmu-nlp/100knock2020
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message  # noqa: E402 isort:skip


class ChunkNormalized(Chunk):
    def __init__(self, chunk):
        self.morphs, self.dst, self.srcs = (*chunk, )
        self.norm = self.get_norm()

    def get_norm(self):
        clause = "".join(m.surface for m in self.morphs if m.pos != "記号")
        return clause

    def has_pos(self, pos):
        return any(m.pos == pos for m in self.morphs)


if __name__ == "__main__":
    res = []
    for chunks in cabocha_into_chunks():
        chunks = {k: ChunkNormalized(v) for k, v in chunks.items()}
        for c in chunks.values():
            if c.dst == -1:
                continue
            if c.dst not in chunks:
                continue
            if c.has_pos("名詞") and chunks[c.dst].has_pos("動詞"):
                res.append(f"{c.norm}\t{chunks[c.dst].norm}\n")
    sys.stdout.writelines(res)
    message(f"write {len(res)} lines", type="success")
Code example #14
File: time.py  Project: tmu-nlp/100knock2020
 def __exit__(self, *args) -> None:
     self.end = time.time()
     self.secs = self.end - self.start
     self.msecs = self.secs * 1000
     if self.verbose:
         message(f"elapsed time = {self.msecs:f} [msec]", type="success")
Code example #15
File: tutorial02.py  Project: tmu-nlp/NLPtutorial2020
 def train(self, path_corpus: str) -> Type["Bigram"]:
     self.model = self.__build_model(path_corpus)
     message(f"train model from {path_corpus}", type="success")
     return self
Code example #16
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
        exit(0)

    with Renderer("単語の異なり数") as out:
        out.result("map", len(list_word_freq(path)))
        out.result("set", len(get_vocab(path)))

    num = 10
    with Renderer(f"数単語の頻度(上位 {num} 単語のみ)") as out:
        out.result(
            "大文字と小文字の区別をする",
            build_word_frequency_cnter(path, str).most_common(num),
        )
        trans = lambda w: w.lower()  # noqa: E731
        out.result(
            "大文字と小文字の区別をしない",
            build_word_frequency_cnter(path, trans).most_common(num),
        )

    if "test" in path:
        for k, v in list_word_freq(path):
            print(k, v, sep="\t")

    message("DONE.", type="status")
Code example #17
    else:
        return f"({sym} {words[int(i)]})"


if __name__ == "__main__":
    if sys.argv[1] == "test":
        grammar_file = "../../test/08-grammar.txt"
        input_file = "../../test/08-input.txt"
    else:
        grammar_file = "../../data/wiki-en-test.grammar"
        input_file = "../../data/wiki-en-short.tok"

    s, t = 0, 1
    with Renderer(sys.argv[1]) as out:
        for i, s_expr in enumerate(cky(grammar_file, input_file, s=s, t=t)):
            message("=" * 3, "line:", s + i, "=" * 3)
            tree = Tree.fromstring(s_expr)
            out.result("S-expression", s_expr)
            out.result("nltk.tree.Tree", tree)
            out.header("nltk.tree.Tree.pretty_print")
            tree.pretty_print()
            # tree.draw()
"""result
[+] main
=== line: 0 ===
[*]  1. S-expression
(S (PP (IN Among) (NP (DT these) (NP' (, ,) (NP' (JJ supervised) (NP' (NN learning) (NNS approaches)))))) (S' (VP (VBP have) (VP (VBN been) (VP' (NP (DT the) (NP' (ADJP (RBS most) (JJ successful)) (NNS algorithms))) (PP (TO to) (NP_NN date))))) (. .)))
[*]  2. nltk.tree.Tree
(S
  (PP
    (IN Among)
Code example #18
File: knock23.py  Project: tmu-nlp/100knock2020
def exec_match(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.match(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat = r"(?P<Level>=+)\s*(?P<Heading>.+)\s*(?P=Level)"
    for _, match in exec_match(wiki, pat):
        level, heading = match.group(1, 2)
        print(
            "  " * (len(level) - 2),
            "+",
            f" lv{len(level) - 1} ",
            heading,
            sep="",
        )

    with Renderer("re.match() vs. re.search()") as out:
        pat_hat = r"^" + pat
        it = zip(exec_match(wiki, pat), exec_search(wiki, pat_hat))
        for (line, match1), (_, match2) in it:
            assert match1.groups() == match2.groups(), line
        else:
            message("same")
Code example #19
"""
57. 特徴量の重みの確認
52で学習したロジスティック回帰モデルの中で,
重みの高い特徴量トップ10と,重みの低い特徴量トップ10を確認せよ.

[MEMO]
2015 年版の knock75 に対応
"""
import os
import sys

from sklearn.metrics import precision_recall_fscore_support

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    classifier = load("chap06-classifier")
    names = load("chap06-vectorizer-names")
    weights = classifier.coef_.flatten()
    ranking = sorted(zip(weights, names), reverse=True)
    with Renderer("knock57") as out:
        out.header("best 10")
        for weight, name in ranking[:10]:
            message(f"{name:15}{weight:f}")
        out.header("worst 10")
        for weight, name in ranking[:-11:-1]:
            message(f"{name:15}{weight:f}")
Code example #20
File: tutorial02.py  Project: tmu-nlp/NLPtutorial2020
 def load(self, path_model: str) -> Type["Bigram"]:
     self.model = self.__load_model(path_model)
     message(f"load model from {path_model}", type="success")
     return self
Code example #21
File: knock29.py  Project: tmu-nlp/100knock2020
def fetch_url_of_img_with_requests(filename: str) -> dict:
    message("fetch:", f"url of `{filename}`", type="status")
    with requests.Session() as s:
        return s.get(url=url, params=make_payload(filename)).json()
Code example #22
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.message import green  # noqa: E402 isort:skip


def remove_em(od: OrderedDict) -> OrderedDict:
    """remove emphasis expressions
        ''italics''
        '''bold'''
        '''''both'''''
    """
    res = OrderedDict()
    reg = re.compile(r"'{2,}")
    for key in od:
        res[key] = reg.sub("", od[key])
    return res


if __name__ == "__main__":
    infobox = load("infobox")
    res = remove_em(infobox)

    with Renderer("knock26") as out:
        for (key, src), (_, dst) in zip(infobox.items(), res.items()):
            if src == dst:
                out.cnt += 1
            else:
                out.result(key, (src, green(dst)))
        if infobox == res:
            message("変化なし", type="warning")
Code example #23
File: knock29.py  Project: tmu-nlp/100knock2020
def save_file_from_url(url: str, filename: str) -> None:
    message("save :", filename, type="status")
    with urllib.request.urlopen(url) as f_in, open(filename, "wb") as f_out:
        f_out.write(f_in.read())
Code example #24
 def __enter__(self) -> Type["SaveHelper"]:
     message("saving:", self.name, CR=True, type="status")
     return self
Code example #25
def load(file_name: str) -> object:
    with open(get_path(file_name), "rb") as f_in:
        obj = dill.load(f_in)
    message("loaded:", trunc(repr(obj)), type="success")
    return obj
Code example #26
def dump(obj: object, file_name: str) -> None:
    with open(get_path(file_name), "wb") as f_out:
        dill.dump(obj, f_out)
    message("saved :", trunc(repr(obj)), type="success")
Code example #27
[MEMO]
2015 年版の knock94-95 に対応
"""
import os
import sys
from zipfile import ZipFile

from scipy.stats import spearmanr

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    wv = load("chap07-embeddings")
    preds, labels = [], []
    with ZipFile("wordsim353.zip") as myzip:
        message(myzip.infolist())
        with myzip.open("combined.csv") as myfile:
            myfile = map(lambda x: x.decode(), myfile)
            message("[header]", next(myfile))  # Word 1,Word 2,Human (mean)
            for line in myfile:
                word1, word2, human = line.split(",")
                preds.append(wv.similarity(word1, word2))
                labels.append(human)
    with Renderer("knock66") as out:
        out.result("Spearman corr", spearmanr(preds, labels)[0])
"""result
0.6849564489532376
"""
Code example #28
File: knock59.py  Project: tmu-nlp/100knock2020
    #     "fit_intercept": [False, True],
    #     "class_weight": [None, "balanced"],
    #     "solver": ["newton-cg", "sag", "saga", "lbfgs"],
    #     "multi_class": ["multinomial"],
    #     "warm_start": [False, True],
    # }
    # for params in ParameterGrid(param_grid):
    #     clfs.append((LogisticRegression(**params), False))

    @timeout_decorator.timeout(3)
    def clf_fit(clf):
        clf.fit(*get_data("train", need_dense))

    models = defaultdict(list)
    for i, (clf, need_dense, *args) in enumerate(clfs):
        message(type(clf).__name__, type="status")
        message(clf.get_params())
        if args:
            message("skip", args, type="warning")
            continue
        if (clf.get_params().get("penalty", None) == "l1"
                and clf.get_params().get("solver", None) == "saga"):
            message("skip", "Too slow", type="warning")
            continue
        try:
            clf_fit(clf)
            score = clf.score(*get_data("valid", need_dense))
            models[score].append(clf)
            message(score, type="success")
        except Exception as e:
            message("skip", e, type="warning")
Code example #29
カテゴリごとの性能をマイクロ平均(micro-average)とマクロ平均(macro-average)で統合せよ.

[MEMO]
2015 年版の knock77 に対応
"""
import os
import sys

from sklearn.metrics import precision_recall_fscore_support

from knock53 import load_dataset

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    score_names = ["Precision", "Recall", "F1_score"]
    classifier = load("chap06-classifier")
    with Renderer("knock56") as out:
        for average in "micro", "macro":
            out.header(average)
            features, labels = load_dataset(f"./test.feature.txt")
            predicts = classifier.predict(features)
            for name, result in zip(
                    score_names,
                    precision_recall_fscore_support(labels,
                                                    predicts,
                                                    average=average)):
                message(f"{name:10}\t{result}")