Example #1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from kiyuna.utils.pickle import dump  # see Examples #3 and #7 for the sys.path setup


def vectorize_init(labels, titles):

    encoder = LabelEncoder()
    encoder.fit(labels)
    labels_ = encoder.transform(labels)
    dump(encoder.classes_, "chap06-encoder-classes")  # <class 'numpy.ndarray'>

    vectorizer = TfidfVectorizer()
    vectorizer.fit(titles)
    features_ = vectorizer.transform(titles)
    dump(vectorizer.vocabulary_, "chap06-vectorizer-vocabs")  # <class 'dict'>
    dump(vectorizer.get_feature_names(), "chap06-vectorizer-names")  # <class 'list'>

    return features_, labels_
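The three pickles written here can be read back with the matching load helper from kiyuna.utils.pickle (imported in Examples #3 and #7). A minimal restore sketch, assuming vectorize_init has already been run:

from kiyuna.utils.pickle import load

classes = load("chap06-encoder-classes")   # numpy.ndarray of category names
vocab = load("chap06-vectorizer-vocabs")   # dict: term -> column index
names = load("chap06-vectorizer-names")    # list of feature names
print(len(classes), "classes,", len(vocab), "tf-idf features")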
Example #2
    wiki = load("UK")

    infobox = re.search(
        r"""
        ^{{基礎情報\s国           # opening line "{{基礎情報 国" of the country infobox template
        (?P<Infobox_body>.+?)     # template body (lazy, may span multiple lines)
        ^}}$                      # closing braces on their own line
        """,
        wiki,
        flags=re.VERBOSE | re.DOTALL | re.MULTILINE,
    ).group("Infobox_body")
    reg = re.compile(r"(.+?)\s*=\s*(.+)", re.DOTALL)
    od = OrderedDict(
        reg.search(line.strip()).groups() for line in infobox.split("\n|")
        if line)
    dump(od, "infobox")

    with Renderer("knock25") as out:
        for k, v in od.items():
            out.result(k, green(v))

    assert od == OrderedDict(
        reg.search(line.strip()).groups()
        for line in extract_infobox(wiki).lstrip("基礎情報 国").split("\n|")
        if line)

    assert od == OrderedDict(
        re.findall(
            r"""
            \|                  # leading pipe of each field
            (?P<Key>.+?)        # field name, e.g. 略名
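A self-contained sketch of the same key/value extraction, run on a tiny hand-written infobox fragment instead of the pickled "UK" article (the sample string is only an illustration, not project data):

import re
from collections import OrderedDict

sample = "{{基礎情報 国\n|略名 = イギリス\n|公式国名 = United Kingdom\n}}"

body = re.search(
    r"^{{基礎情報\s国(?P<Infobox_body>.+?)^}}$",
    sample,
    flags=re.DOTALL | re.MULTILINE,
).group("Infobox_body")

reg = re.compile(r"(.+?)\s*=\s*(.+)", re.DOTALL)
fields = OrderedDict(
    reg.search(line.strip()).groups() for line in body.split("\n|") if line.strip()
)
print(fields)  # OrderedDict([('略名', 'イギリス'), ('公式国名', 'United Kingdom')])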
Example #3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip


def vectorize_init(labels, titles):

    encoder = LabelEncoder()
    encoder.fit(labels)
    labels_ = encoder.transform(labels)
    dump(encoder.classes_, "chap06-encoder-classes")  # <class 'numpy.ndarray'>

    vectorizer = TfidfVectorizer()
    vectorizer.fit(titles)
    features_ = vectorizer.transform(titles)
    dump(vectorizer.vocabulary_, "chap06-vectorizer-vocabs")  # <class 'dict'>
    dump(vectorizer.get_feature_names(), "chap06-vectorizer-names")  # <class 'list'>

    return features_, labels_


def load_dataset(path):
    labels, titles = zip(*(line.strip().split("\t") for line in open(path)))
    return vectorize_init(labels, titles)


if __name__ == "__main__":
    features, labels = load_dataset("./train.feature.txt")
    classifier = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=123)
    classifier.fit(features, labels)
    dump(classifier, "chap06-classifier")
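A quick sanity check on the fit, reusing load_dataset and load from above (note that load_dataset re-fits the vectorizer on the same file, so this measures training accuracy only; the metric import is an addition to the original script):

from sklearn.metrics import accuracy_score

features, labels = load_dataset("./train.feature.txt")
classifier = load("chap06-classifier")
print("train accuracy:", accuracy_score(labels, classifier.predict(features)))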
Example #4
def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names
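wv[country_name] raises a KeyError for any country name that is missing from the word2vec vocabulary. A defensive variant of the same function (a sketch, not the author's code, reusing load, dump, np and list_country_names from this file):

def country_embeddings_safe():
    wv = load("chap07-embeddings")
    # keep only the names that actually have a vector
    names = [name for name in list_country_names() if name in wv]
    country_names = np.array(names, dtype=object)
    embeddings = [wv[name] for name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names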
Example #5
                country_names |= {country1, country2}
    return list(country_names)


def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names


if __name__ == "__main__":
    embeddings, country_names = country_embeddings()
    kmeans = KMeans(n_clusters=5).fit(embeddings)
    dump(kmeans, "chap07-kmeans")
    with Renderer("knock67", start=0) as out:
        for i in range(5):
            out.result(f"Class {i}", country_names[kmeans.labels_ == i])
"""result
[*]  0. Class 0
['Bhutan' 'Bahrain' 'Japan' 'Morocco' 'Indonesia' 'Pakistan' 'Thailand'
 'Tunisia' 'Oman' 'Egypt' 'Turkey' 'Qatar' 'Iraq' 'Laos' 'Libya' 'Lebanon'
 'Jordan' 'Afghanistan' 'Bangladesh' 'Syria' 'Nepal' 'China' 'Vietnam'
 'Iran']
[*]  1. Class 1
['Samoa' 'Chile' 'Dominica' 'Australia' 'Ecuador' 'Fiji' 'Bahamas'
 'Canada' 'Jamaica' 'Nicaragua' 'Cuba' 'Peru' 'Venezuela' 'Uruguay'
 'Guyana' 'Honduras' 'Belize' 'Greenland' 'Philippines' 'Taiwan' 'Tuvalu'
 'Suriname']
[*]  2. Class 2
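One way to label the clusters afterwards is to see which member sits closest to each centroid; a small sketch assuming the two pickles written above (the cosine-distance step is an addition, not part of the original code):

from sklearn.metrics.pairwise import cosine_distances

embeddings, country_names = load("chap07-embeddings-country")
kmeans = load("chap07-kmeans")

dist = cosine_distances(embeddings, kmeans.cluster_centers_)  # (n_countries, n_clusters)
for i in range(kmeans.n_clusters):
    members = kmeans.labels_ == i
    closest = country_names[members][dist[members, i].argmin()]
    print(f"Class {i}: closest to centroid = {closest}")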
Example #6
    if path == "MEMO":
        with Renderer("個人メモ") as out:
            out.result(
                "type hints",
                (
                    typing.get_type_hints(list_word_freq),
                    build_word_frequency_cnter.__annotations__,
                ),
            )
            out.header("with 内で return しても大丈夫なはず")
            dis.dis(build_word_frequency_cnter, file=sys.stderr)
            out.header("doctest")
            doctest.testmod(verbose=True)
            out.header("check serialize")
            cnter = list_word_freq("../../test/00-input.txt")
            dump(cnter, "cnter")
            cnter = load("cnter")
        exit(0)

    with Renderer("単語の異なり数") as out:
        out.result("map", len(list_word_freq(path)))
        out.result("set", len(get_vocab(path)))

    num = 10
    with Renderer(f"数単語の頻度(上位 {num} 単語のみ)") as out:
        out.result(
            "大文字と小文字の区別をする",
            build_word_frequency_cnter(path, str).most_common(num),
        )
        trans = lambda w: w.lower()  # noqa: E731
        out.result(
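build_word_frequency_cnter itself lies outside the excerpt; a plausible minimal version, under the assumption that the input file holds whitespace-separated tokens and that the second argument is a per-word normalisation function (str for case-sensitive counting, or the trans lambda above for case-insensitive):

import collections
from typing import Callable

def build_word_frequency_cnter(
    path: str, trans: Callable[[str], str] = str
) -> collections.Counter:
    """Count word frequencies in *path* after applying *trans* to each token."""
    cnter = collections.Counter()
    with open(path) as f:
        for line in f:
            cnter.update(trans(word) for word in line.split())
    return cnter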
Example #7
[MEMO]
Corresponds to knock86 in the 2015 edition
"""
import os
import sys

from gensim.models import KeyedVectors

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    if not os.path.exists("../pickles/chap07-embeddings.pkl"):
        path = "./GoogleNews-vectors-negative300.bin.gz"
        embeddings = KeyedVectors.load_word2vec_format(path, binary=True)
        dump(embeddings, "chap07-embeddings")

    wv = load("chap07-embeddings")
    print(wv["United_States"])
"""result
[-3.61328125e-02 -4.83398438e-02  2.35351562e-01  1.74804688e-01
 -1.46484375e-01 -7.42187500e-02 -1.01562500e-01 -7.71484375e-02
  1.09375000e-01 -5.71289062e-02 -1.48437500e-01 -6.00585938e-02
  1.74804688e-01 -7.71484375e-02  2.58789062e-02 -7.66601562e-02
 -3.80859375e-02  1.35742188e-01  3.75976562e-02 -4.19921875e-02
 -3.56445312e-02  5.34667969e-02  3.68118286e-04 -1.66992188e-01
 -1.17187500e-01  1.41601562e-01 -1.69921875e-01 -6.49414062e-02
 -1.66992188e-01  1.00585938e-01  1.15722656e-01 -2.18750000e-01
 -9.86328125e-02 -2.56347656e-02  1.23046875e-01 -3.54003906e-02
 -1.58203125e-01 -1.60156250e-01  2.94189453e-02  8.15429688e-02
  6.88476562e-02  1.87500000e-01  6.49414062e-02  1.15234375e-01
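Once the KeyedVectors object is cached, the usual gensim queries work on it as well; for example (a sketch, not part of the original script, reusing the load helper above):

wv = load("chap07-embeddings")

# cosine similarity between two vocabulary entries
print(wv.similarity("United_States", "U.S."))

# ten nearest neighbours of a word
for word, score in wv.most_similar("United_States", topn=10):
    print(f"{word}\t{score:.3f}")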
Example #8
"""
import gzip
import json
import os
import pprint
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer, message  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump  # noqa: E402 isort:skip


def extract_wiki(path: str, title: str) -> dict:
    with gzip.open(path, "rt") as f:
        for line in f:
            d = json.loads(line)
            if d["title"] == title:
                return d


if __name__ == "__main__":
    input_path, title, out_fname, *_ = sys.argv[1:]

    article = extract_wiki(input_path, title)

    dump(article["text"], out_fname)

    with Renderer("MEMO") as out:
        out.result("article.keys()", article.keys())
        # pprint.pprint(article)
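The script takes three positional arguments: the gzipped JSON-lines dump, the article title, and the pickle name to write. A hypothetical direct call, reusing extract_wiki above and assuming the NLP 100 Knock jawiki-country.json.gz file with the Japanese article title イギリス (presumably the same text that Example #2 loads back as "UK"):

article = extract_wiki("jawiki-country.json.gz", "イギリス")
print(sorted(article.keys()))            # includes at least 'title' and 'text'
print(article["text"].splitlines()[0])   # first line of the raw wiki markup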