Example 1
def render_html(path_img, path_html="out29.html") -> None:
    message("save :", path_html, type="status")
    contents = ("<!DOCTYPE html><html>"
                "<head><title>knock29</title></head>"
                '<body><img src="%s" width="128"/></body>'
                "</html>" % path_img)
    with open(path_html, "w") as f:
        f.write(contents)

    message("open :", path_html, type="status")
    webbrowser.open(path_html)
Example 2
 def tag(self, path_input: str, path_output: str, **kwargs) -> None:
     self.params.update(**kwargs)
     res = []
     with open(path_input) as f_in:
         for line in tqdm.tqdm(f_in):
             words = [self.trans(word) for word in line.split()]
             best_edge = self.__forward(words)
             tags = self.__backward(words, best_edge)
             res.append(" ".join(tags) + "\n")
     with open(path_output, "w") as f_out:
         f_out.writelines(res)
     message(f"saved : {path_output}", type="success")
Example 3
def sample(test_path, epochs=1, α=0.01, β=0.01, num_topics=2):
    """ #09 p24 """
    xcorpus, ycorpus, xcounts, ycounts, wordtype = initialize(test_path, num_topics)
    for epoch in range(1, epochs + 1):
        message("epoch =", epoch, type="status")
        ll = 0
        for i in tqdm(range(len(xcorpus)), leave=False):
            for j in range(len(xcorpus[i])):
                x = xcorpus[i][j]
                y = ycorpus[i][j]
                add_counts(xcounts, ycounts, x, y, i, -1)
                probs = []
                for k in range(num_topics):
                    p_xk = (xcounts[f"{x}|{k}"] + α) / (xcounts[k] + α * wordtype)
                    p_ky = (ycounts[f"{k}|{i}"] + β) / (ycounts[i] + β * num_topics)
                    probs.append(p_xk * p_ky)
                new_y = sampleone(probs)
                ll += math.log(probs[new_y])
                add_counts(xcounts, ycounts, x, new_y, i, 1)
                ycorpus[i][j] = new_y
        message("ll =", ll, type="success")
    return xcorpus, ycorpus
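
sampleone and add_counts are helpers defined elsewhere in the module; only their call sites appear above. As a point of reference, a minimal sketch of what a sampler with that call signature typically does (drawing an index in proportion to unnormalized probabilities) could look like the following; the body is an assumption, not the author's code.

import random

def sampleone(probs):
    # Hypothetical helper: draw index i with probability proportional to probs[i].
    z = sum(probs)
    remaining = random.random() * z
    for i, p in enumerate(probs):
        remaining -= p
        if remaining <= 0:
            return i
    return len(probs) - 1  # guard against floating-point round-off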
Example 4
def test(args: argparse.Namespace) -> None:
    model = BigramLM(WittenBell=args.WittenBell).load(args.model)

    res = model.test(args.test)
    if args.name:
        message(
            f"[{args.name} | {get_ext(args.WittenBell)}"
            f" default(λ_1={0.95:.2f}, λ_2={0.95:.2f})]",
            file=sys.stdout,
        )
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)

    # λ_1, λ_2 = grid_search(
    #     model,
    #     args.test,
    #     rng=(0.01, 1, 0.01),
    #     save=f"result_{get_ext(args.WittenBell)}.png",
    # )
    λ_1, λ_2 = grid_search(
        model,
        args.test,
        rng=(0.05, 1, 0.05),
        save=f"fig1_{get_ext(args.WittenBell)}.png",
    )
    # λ_1, λ_2 = grid_search(
    #     model,
    #     args.test,
    #     rng1=(λ_1 - 0.1, λ_1 + 0.1, 0.01),
    #     rng2=(λ_2 - 0.1, λ_2 + 0.1, 0.01),
    #     save=f"fig2_{get_ext(args.WittenBell)}.png",
    # )

    res = model.test(args.test, λ_1=λ_1, λ_2=λ_2)
    if args.name:
        message(
            f"[{args.name} | {get_ext(args.WittenBell)}"
            f" optimized(λ_1={λ_1:.2f}, λ_2={λ_2:.2f})]",
            file=sys.stdout,
        )
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)
    """result
Example 5
class Tokenizer(UnigramLM):
    def tokenize(
        self,
        path_input: str,
        path_output: str,
        *,
        λ_1: float = 0.95,
        vocab_size: int = 1_000_000,
    ) -> None:
        def forward(line: str, V: int = vocab_size) -> List[Tuple[int, int]]:
            size = len(line)
            best_edge = [None] * (size + 1)
            best_score = [float("inf")] * (size + 1)
            best_score[0] = 0
            for word_end in range(1, size + 1):
                for word_begin in range(word_end):
                    word = line[word_begin:word_end]
                    if word in self.model or len(word) == 1:
                        prob = λ_1 * self.model.get(word, 0) + (1 - λ_1) / V
                        my_score = best_score[word_begin] + -math.log2(prob)
                        if my_score < best_score[word_end]:
                            best_score[word_end] = my_score
                            best_edge[word_end] = (word_begin, word_end)
            return best_edge

        def backward(line: str, best_edge: List[Tuple[int, int]]) -> List[str]:
            words = []
            next_edge = best_edge[-1]
            while next_edge:
                words.append(line[next_edge[0]:next_edge[1]])
                next_edge = best_edge[next_edge[0]]
            words.reverse()
            return words

        res = []
        with open(path_input) as f_in:
            for line in map(lambda x: x.strip(), f_in):
                best_edge = forward(line)
                words = backward(line, best_edge)
                res.append(" ".join(words) + "\n")
        with open(path_output, "w") as f_out:
            f_out.writelines(res)
        message(f"saved {path_output}", type="success")
Example 6
def test(args: argparse.Namespace) -> None:
    model = UnigramLM().load(args.model)

    res = model.test(args.test)
    if args.name:
        message(f"[{args.name} | default(λ_1={0.95:.2f})]", file=sys.stdout)
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)

    entropy, λ_1 = min(
        (model.test(args.test, λ_unk=1 - λ_1)["entropy_H"], λ_1)
        for λ_1 in np.arange(0, 1, 0.01)
    )

    res = model.test(args.test, λ_unk=1 - λ_1)
    if args.name:
        message(
            f"[{args.name} | optimized(λ_1={λ_1:.2f})]", file=sys.stdout,
        )
    for k, v in res.items():
        message(f"{k:15s} = {v:f}", file=sys.stdout)

    """result
Example 7
def grid_search(
    model: Model,
    path_test: str,
    *,
    rng: Optional[Tuple[float, float, float]] = None,
    rng1: Optional[Tuple[float, float, float]] = None,
    rng2: Optional[Tuple[float, float, float]] = None,
    save: Optional[str] = None,
) -> Tuple[float, float]:
    def get_param(idx: np.ndarray) -> np.ndarray:
        return (np.array([rng1[0], rng2[0]]) +
                np.array([rng1[2], rng2[2]]) * idx)

    if rng:
        rng1 = rng2 = rng
    assert rng1 is not None
    assert rng2 is not None

    with Renderer("grid search") as out:
        cnt1 = len(np.arange(*rng1))
        cnt2 = len(np.arange(*rng2))
        E = np.zeros((cnt2, cnt1))
        for j, λ_2 in enumerate(np.arange(*rng2)):
            message(f"{j + 1:2d} / {cnt2}", CR=True, type="status")
            for i, λ_1 in enumerate(np.arange(*rng1)):
                E[j, i] = model.test(path_test, λ_1=λ_1, λ_2=λ_2)["entropy_H"]
        message("", CR=True)

        ma_y, ma_x = np.where(E == E.max())
        mi_y, mi_x = np.where(E == E.min())
        out.result("max", (E.max(), get_param(np.hstack([ma_x, ma_y]))))
        out.result("min", (E.min(), get_param(np.hstack([mi_x, mi_y]))))

    if save:
        fig = plt.figure()
        ax = fig.add_subplot(111)

        mappable = ax.pcolor(E, cmap="jet", edgecolors="k", alpha=0.8)
        fig.colorbar(mappable)

        ax.scatter(ma_x + 0.5, ma_y + 0.5, c="r", label="max")
        ax.scatter(mi_x + 0.5, mi_y + 0.5, c="b", label="min")

        ax.set_xticks(np.arange(cnt1) + 0.5, minor=False)
        ax.set_yticks(np.arange(cnt2) + 0.5, minor=False)
        ax.set_xticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng1)),
            minor=False,
            rotation=45,
        )
        ax.set_yticklabels(
            map(lambda x: f"{x:.2f}"[1:], np.arange(*rng2)),
            minor=False,
        )
        ax.set_title(f"エントロピー {get_ext(model.WittenBell)}")
        ax.set_xlabel("$λ_1$")
        ax.set_ylabel("$λ_2$")
        ax.set_aspect("equal")
        ax.legend(loc="lower right")
        plt.savefig(save)

    return get_param(np.hstack([mi_x, mi_y]))
Example 8
def fetch_url_of_img_with_urllib(filename: str) -> dict:
    message("fetch:", f"url of `{filename}`", type="status")
    url_ = url + "?%s" % urllib.parse.urlencode(make_payload(filename))
    with urllib.request.urlopen(url_) as f:
        return json.loads(f.read().decode("utf-8"))
Example 9
 def train(self, path_corpus: str) -> "POSTagger":
     self.probs, self.possible_tags = self.__build_model(path_corpus)
     message(f"train model from {path_corpus}", type="success")
     return self
Example 10
 def load(self, path_model: str) -> "POSTagger":
     self.probs, self.possible_tags = self.__load_model(path_model)
     message(f"load  model from {path_model}", type="success")
     return self
Example 11
 def __exit__(self, *args) -> None:
     message("saved :", self.name, "\n", CR=True, type="success")
Example 12
    for k, v in cnter.items():
        idx = v.index(max(v))
        groups[idx].append((v[idx], k))
        tmp.append((v[idx], k))
    print_cnt = min(300, len(tmp))
    th = sorted(tmp, reverse=True)[print_cnt - 1][0]
    for i, group in enumerate(groups):
        print("=" * 5, i, "=" * 5)
        group.sort(reverse=True)
        res = [word for freq, word in group if word not in stop_words and freq >= th]
        pprint(res, width=80, compact=True)


if __name__ == "__main__":
    if sys.argv[1:] == ["test"]:
        message("test", type="status")
        learn_lda(test_path="../../test/07-train.txt", epochs=50)
    else:
        message("main", type="status")
        stop_words = nltk.corpus.stopwords.words("english")
        symbols = [
            "'",
            '"',
            ":",
            ";",
            ".",
            ",",
            "-",
            "!",
            "?",
            ")",
Example 13
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message  # noqa: E402 isort:skip


class ChunkNormalized(Chunk):
    def __init__(self, chunk):
        self.morphs, self.dst, self.srcs = (*chunk, )
        self.norm = self.get_norm()

    def get_norm(self):
        clause = "".join(m.surface for m in self.morphs if m.pos != "記号")
        return clause

    def has_pos(self, pos):
        return any(m.pos == pos for m in self.morphs)


if __name__ == "__main__":
    res = []
    for chunks in cabocha_into_chunks():
        chunks = {k: ChunkNormalized(v) for k, v in chunks.items()}
        for c in chunks.values():
            if c.dst == -1:
                continue
            if c.dst not in chunks:
                continue
            if c.has_pos("名詞") and chunks[c.dst].has_pos("動詞"):
                res.append(f"{c.norm}\t{chunks[c.dst].norm}\n")
    sys.stdout.writelines(res)
    message(f"write {len(res)} lines", type="success")
Example 14
 def __exit__(self, *args) -> None:
     self.end = time.time()
     self.secs = self.end - self.start
     self.msecs = self.secs * 1000
     if self.verbose:
         message(f"elapsed time = {self.msecs:f} [msec]", type="success")
Example 15
 def train(self, path_corpus: str) -> "Bigram":
     self.model = self.__build_model(path_corpus)
     message(f"train model from {path_corpus}", type="success")
     return self
Example 16
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
        exit(0)

    with Renderer("単語の異なり数") as out:
        out.result("map", len(list_word_freq(path)))
        out.result("set", len(get_vocab(path)))

    num = 10
    with Renderer(f"数単語の頻度(上位 {num} 単語のみ)") as out:
        out.result(
            "大文字と小文字の区別をする",
            build_word_frequency_cnter(path, str).most_common(num),
        )
        trans = lambda w: w.lower()  # noqa: E731
        out.result(
            "大文字と小文字の区別をしない",
            build_word_frequency_cnter(path, trans).most_common(num),
        )

    if "test" in path:
        for k, v in list_word_freq(path):
            print(k, v, sep="\t")

    message("DONE.", type="status")
Example 17
    else:
        return f"({sym} {words[int(i)]})"


if __name__ == "__main__":
    if sys.argv[1] == "test":
        grammar_file = "../../test/08-grammar.txt"
        input_file = "../../test/08-input.txt"
    else:
        grammar_file = "../../data/wiki-en-test.grammar"
        input_file = "../../data/wiki-en-short.tok"

    s, t = 0, 1
    with Renderer(sys.argv[1]) as out:
        for i, s_expr in enumerate(cky(grammar_file, input_file, s=s, t=t)):
            message("=" * 3, "line:", s + i, "=" * 3)
            tree = Tree.fromstring(s_expr)
            out.result("S-expression", s_expr)
            out.result("nltk.tree.Tree", tree)
            out.header("nltk.tree.Tree.pretty_print")
            tree.pretty_print()
            # tree.draw()
"""result
[+] main
=== line: 0 ===
[*]  1. S-expression
(S (PP (IN Among) (NP (DT these) (NP' (, ,) (NP' (JJ supervised) (NP' (NN learning) (NNS approaches)))))) (S' (VP (VBP have) (VP (VBN been) (VP' (NP (DT the) (NP' (ADJP (RBS most) (JJ successful)) (NNS algorithms))) (PP (TO to) (NP_NN date))))) (. .)))
[*]  2. nltk.tree.Tree
(S
  (PP
    (IN Among)
Example 18
def exec_match(wiki: str, pattern: str) -> Iterator[Tuple[str, Match]]:
    reg = re.compile(pattern)
    for line in wiki.split("\n"):
        match = reg.match(line)
        if match:
            yield line, match


if __name__ == "__main__":
    wiki = load("UK")

    pat = r"(?P<Level>=+)\s*(?P<Heading>.+)\s*(?P=Level)"
    for _, match in exec_match(wiki, pat):
        level, heading = match.group(1, 2)
        print(
            "  " * (len(level) - 2),
            "+",
            f" lv{len(level) - 1} ",
            heading,
            sep="",
        )

    with Renderer("re.match() vs. re.search()") as out:
        pat_hat = r"^" + pat
        it = zip(exec_match(wiki, pat), exec_search(wiki, pat_hat))
        for (line, match1), (_, match2) in it:
            assert match1.groups() == match2.groups(), line
        else:
            message("same")
Example 19
"""
57. Checking feature weights
For the logistic regression model trained in knock 52,
check the 10 features with the highest weights and the 10 features with the lowest weights.

[MEMO]
Corresponds to knock75 in the 2015 edition
"""
import os
import sys

from sklearn.metrics import precision_recall_fscore_support

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    classifier = load("chap06-classifier")
    names = load("chap06-vectorizer-names")
    # NOTE: for a multiclass model, coef_ has shape (n_classes, n_features), so
    # flatten() followed by zip(names) below ranks only the first class's weights.
    weights = classifier.coef_.flatten()
    ranking = sorted(zip(weights, names), reverse=True)
    with Renderer("knock57") as out:
        out.header("best 10")
        for weight, name in ranking[:10]:
            message(f"{name:15}{weight:f}")
        out.header("worst 10")
        for weight, name in ranking[:-11:-1]:
            message(f"{name:15}{weight:f}")
Example 20
 def load(self, path_model: str) -> "Bigram":
     self.model = self.__load_model(path_model)
     message(f"load model from {path_model}", type="success")
     return self
Example 21
def fetch_url_of_img_with_requests(filename: str) -> dict:
    message("fetch:", f"url of `{filename}`", type="status")
    with requests.Session() as s:
        return s.get(url=url, params=make_payload(filename)).json()
Example 22
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.message import green  # noqa: E402 isort:skip


def remove_em(od: OrderedDict) -> OrderedDict:
    """remove emphasis expressions
        ''italics''
        '''bold'''
        '''''both'''''
    """
    res = OrderedDict()
    reg = re.compile(r"'{2,}")
    for key in od:
        res[key] = reg.sub("", od[key])
    return res


if __name__ == "__main__":
    infobox = load("infobox")
    res = remove_em(infobox)

    with Renderer("knock26") as out:
        for (key, src), (_, dst) in zip(infobox.items(), res.items()):
            if src == dst:
                out.cnt += 1
            else:
                out.result(key, (src, green(dst)))
        if infobox == res:
            message("変化なし", type="warning")
Example 23
def save_file_from_url(url: str, filename: str) -> None:
    message("save :", filename, type="status")
    with urllib.request.urlopen(url) as f_in, open(filename, "wb") as f_out:
        f_out.write(f_in.read())
Example 24
 def __enter__(self) -> "SaveHelper":
     message("saving:", self.name, CR=True, type="status")
     return self
Example 25
def load(file_name: str) -> object:
    with open(get_path(file_name), "rb") as f_in:
        obj = dill.load(f_in)
    message("loaded:", trunc(repr(obj)), type="success")
    return obj
Example 26
def dump(obj: object, file_name: str) -> None:
    with open(get_path(file_name), "wb") as f_out:
        dill.dump(obj, f_out)
    message("saved :", trunc(repr(obj)), type="success")
Example 27
[MEMO]
Corresponds to knock94-95 in the 2015 edition
"""
import os
import sys
from zipfile import ZipFile

from scipy.stats import spearmanr

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import load  # noqa: E402 isort:skip

if __name__ == "__main__":
    wv = load("chap07-embeddings")
    preds, labels = [], []
    with ZipFile("wordsim353.zip") as myzip:
        message(myzip.infolist())
        with myzip.open("combined.csv") as myfile:
            myfile = map(lambda x: x.decode(), myfile)
            message("[header]", next(myfile))  # Word 1,Word 2,Human (mean)
            for line in myfile:
                word1, word2, human = line.split(",")
                preds.append(wv.similarity(word1, word2))
                labels.append(float(human))  # parse as float so spearmanr ranks numerically
    with Renderer("knock66") as out:
        out.result("Spearman corr", spearmanr(preds, labels)[0])
"""result
0.6849564489532376
"""
Example 28
    #     "fit_intercept": [False, True],
    #     "class_weight": [None, "balanced"],
    #     "solver": ["newton-cg", "sag", "saga", "lbfgs"],
    #     "multi_class": ["multinomial"],
    #     "warm_start": [False, True],
    # }
    # for params in ParameterGrid(param_grid):
    #     clfs.append((LogisticRegression(**params), False))

    @timeout_decorator.timeout(3)
    def clf_fit(clf):
        clf.fit(*get_data("train", need_dense))

    models = defaultdict(list)
    for i, (clf, need_dense, *args) in enumerate(clfs):
        message(type(clf).__name__, type="status")
        message(clf.get_params())
        if args:
            message("skip", args, type="warning")
            continue
        if (clf.get_params().get("penalty", None) == "l1"
                and clf.get_params().get("solver", None) == "saga"):
            message("skip", "Too slow", type="warning")
            continue
        try:
            clf_fit(clf)
            score = clf.score(*get_data("valid", need_dense))
            models[score].append(clf)
            message(score, type="success")
        except Exception as e:
            message("skip", e, type="warning")
Example 29
Aggregate the per-category performance using the micro-average and the macro-average.

[MEMO]
Corresponds to knock77 in the 2015 edition
"""
import os
import sys

from sklearn.metrics import precision_recall_fscore_support

from knock53 import load_dataset

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import message, Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    score_names = ["Precision", "Recall", "F1_score"]
    classifier = load("chap06-classifier")
    with Renderer("knock56") as out:
        for average in "micro", "macro":
            out.header(average)
            features, labels = load_dataset("./test.feature.txt")
            predicts = classifier.predict(features)
            for name, result in zip(
                    score_names,
                    precision_recall_fscore_support(labels,
                                                    predicts,
                                                    average=average)):
                message(f"{name:10}\t{result}")