wiki = load("UK")

infobox = re.search(
    r"""
    ^{{基礎情報\s国
    (?P<Infobox_body>.+?)
    ^}}$
    """,
    wiki,
    flags=re.VERBOSE | re.DOTALL | re.MULTILINE,
).group("Infobox_body")

reg = re.compile(r"(.+?)\s*=\s*(.+)", re.DOTALL)
od = OrderedDict(
    reg.search(line.strip()).groups()
    for line in infobox.split("\n|")
    if line
)
dump(od, "infobox")

with Renderer("knock25") as out:
    for k, v in od.items():
        out.result(k, green(v))

assert od == OrderedDict(
    reg.search(line.strip()).groups()
    for line in extract_infobox(wiki).lstrip("基礎情報 国").split("\n|")
    if line
)

assert od == OrderedDict(
    re.findall(
        r"""
        \|            # literal "|"
        (?P<Key>.+?)  # field name (e.g. 略名, "abbreviated name")
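# [MEMO] `extract_infobox` is referenced in the assert above but not defined in
#        this excerpt. A minimal sketch of what such a helper might look like,
#        assuming it returns the raw template body including the template name
#        (which is why the caller strips "基礎情報 国" with lstrip). The function
#        body below is an assumption, not the repository's actual implementation.
import re


def extract_infobox(wiki: str) -> str:
    match = re.search(
        r"^{{(基礎情報\s国.+?)^}}$",
        wiki,
        flags=re.DOTALL | re.MULTILINE,
    )
    return match.group(1) if match else ""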
import os
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.message import Renderer  # noqa: E402 isort:skip
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip


def vectorize_init(labels, titles):
    encoder = LabelEncoder()
    encoder.fit(labels)
    labels_ = encoder.transform(labels)
    dump(encoder.classes_, "chap06-encoder-classes")  # <class 'numpy.ndarray'>
    vectorizer = TfidfVectorizer()
    vectorizer.fit(titles)
    features_ = vectorizer.transform(titles)
    dump(vectorizer.vocabulary_, "chap06-vectorizer-vocabs")  # <class 'dict'>
    dump(vectorizer.get_feature_names(), "chap06-vectorizer-names")
    # [+] saved : chap06-model
    return features_, labels_


def load_dataset(path):
    labels, titles = zip(*(line.strip().split("\t") for line in open(path)))
    return vectorize_init(labels, titles)


if __name__ == "__main__":
    features, labels = load_dataset("./train.feature.txt")
    classifier = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", random_state=123
    )
    classifier.fit(features, labels)
    dump(classifier, "chap06-classifier")
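# [MEMO] A minimal sketch of how the dumped classifier might be evaluated later,
#        assuming `load` is the inverse of `dump` and that a held-out split named
#        "./test.feature.txt" (an assumed file name) uses the same
#        "label<TAB>title" format. Only vocabulary_/classes_ are pickled above,
#        so this sketch re-fits the vectorizer and encoder on the training split
#        instead of restoring the fitted objects.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from kiyuna.utils.pickle import load


def evaluate(train_path="./train.feature.txt", test_path="./test.feature.txt"):
    y_train, t_train = zip(*(line.strip().split("\t") for line in open(train_path)))
    y_test, t_test = zip(*(line.strip().split("\t") for line in open(test_path)))
    vectorizer = TfidfVectorizer().fit(t_train)   # same fit as vectorize_init
    encoder = LabelEncoder().fit(y_train)
    classifier = load("chap06-classifier")
    X_test = vectorizer.transform(t_test)
    return classifier.score(X_test, encoder.transform(y_test))  # accuracy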
        country_names |= {country1, country2}
    return list(country_names)


def country_embeddings():
    wv = load("chap07-embeddings")
    country_names = np.array(list_country_names(), dtype=object)
    embeddings = [wv[country_name] for country_name in country_names]
    dump([embeddings, country_names], "chap07-embeddings-country")
    return embeddings, country_names


if __name__ == "__main__":
    embeddings, country_names = country_embeddings()
    kmeans = KMeans(n_clusters=5).fit(embeddings)
    dump(kmeans, "chap07-kmeans")
    with Renderer("knock67", start=0) as out:
        for i in range(5):
            out.result(f"Class {i}", country_names[kmeans.labels_ == i])


"""result
[*] 0. Class 0
['Bhutan' 'Bahrain' 'Japan' 'Morocco' 'Indonesia' 'Pakistan' 'Thailand'
 'Tunisia' 'Oman' 'Egypt' 'Turkey' 'Qatar' 'Iraq' 'Laos' 'Libya' 'Lebanon'
 'Jordan' 'Afghanistan' 'Bangladesh' 'Syria' 'Nepal' 'China' 'Vietnam' 'Iran']
[*] 1. Class 1
['Samoa' 'Chile' 'Dominica' 'Australia' 'Ecuador' 'Fiji' 'Bahamas' 'Canada'
 'Jamaica' 'Nicaragua' 'Cuba' 'Peru' 'Venezuela' 'Uruguay' 'Guyana'
 'Honduras' 'Belize' 'Greenland' 'Philippines' 'Taiwan' 'Tuvalu' 'Suriname']
[*] 2. Class 2
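# [MEMO] A minimal sketch of reusing the pickles dumped above, assuming `load`
#        mirrors `dump`. It reloads the fitted KMeans model together with the
#        country embeddings and prints, for each cluster, the country whose
#        vector lies closest to the cluster centroid (Euclidean distance, the
#        criterion KMeans itself optimises). Not part of the original script.
import numpy as np

from kiyuna.utils.pickle import load

embeddings, country_names = load("chap07-embeddings-country")
kmeans = load("chap07-kmeans")
X = np.asarray(embeddings)
for i, center in enumerate(kmeans.cluster_centers_):
    idx = np.linalg.norm(X - center, axis=1).argmin()
    print(f"Class {i}: closest to centroid -> {country_names[idx]}")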
if path == "MEMO":
    with Renderer("個人メモ") as out:  # "personal memo"
        out.result(
            "type hints",
            (
                typing.get_type_hints(list_word_freq),
                build_word_frequency_cnter.__annotations__,
            ),
        )
        out.header("with 内で return しても大丈夫なはず")  # "returning inside a with block should be fine"
        dis.dis(build_word_frequency_cnter, file=sys.stderr)
        out.header("doctest")
        doctest.testmod(verbose=True)
        out.header("check serialize")
        cnter = list_word_freq("../../test/00-input.txt")
        dump(cnter, "cnter")
        cnter = load("cnter")
    exit(0)

with Renderer("単語の異なり数") as out:  # "number of distinct words"
    out.result("map", len(list_word_freq(path)))
    out.result("set", len(get_vocab(path)))

num = 10
with Renderer(f"数単語の頻度(上位 {num} 単語のみ)") as out:  # "word frequencies (top {num} words only)"
    out.result(
        "大文字と小文字の区別をする",  # "case-sensitive"
        build_word_frequency_cnter(path, str).most_common(num),
    )
    trans = lambda w: w.lower()  # noqa: E731
    out.result(
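# [MEMO] `build_word_frequency_cnter` is not shown in this excerpt. A minimal
#        sketch with the same call shape (a path plus a per-word transform such
#        as `str` or `str.lower`), assuming whitespace-separated tokens, one
#        sentence per line. This is an assumption, not the repository's code.
import collections
from typing import Callable, Counter


def build_word_frequency_cnter(path: str, trans: Callable[[str], str]) -> Counter[str]:
    cnter: Counter[str] = collections.Counter()
    with open(path) as f:
        for line in f:
            cnter.update(trans(word) for word in line.split())
    return cnter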
[MEMO] Corresponds to knock86 in the 2015 edition
"""
import os
import sys

from gensim.models import KeyedVectors

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from kiyuna.utils.pickle import dump, load  # noqa: E402 isort:skip

if __name__ == "__main__":
    if not os.path.exists("../pickles/chap07-embeddings.pkl"):
        path = "./GoogleNews-vectors-negative300.bin.gz"
        embeddings = KeyedVectors.load_word2vec_format(path, binary=True)
        dump(embeddings, "chap07-embeddings")
    wv = load("chap07-embeddings")
    print(wv["United_States"])


"""result
[-3.61328125e-02 -4.83398438e-02  2.35351562e-01  1.74804688e-01
 -1.46484375e-01 -7.42187500e-02 -1.01562500e-01 -7.71484375e-02
  1.09375000e-01 -5.71289062e-02 -1.48437500e-01 -6.00585938e-02
  1.74804688e-01 -7.71484375e-02  2.58789062e-02 -7.66601562e-02
 -3.80859375e-02  1.35742188e-01  3.75976562e-02 -4.19921875e-02
 -3.56445312e-02  5.34667969e-02  3.68118286e-04 -1.66992188e-01
 -1.17187500e-01  1.41601562e-01 -1.69921875e-01 -6.49414062e-02
 -1.66992188e-01  1.00585938e-01  1.15722656e-01 -2.18750000e-01
 -9.86328125e-02 -2.56347656e-02  1.23046875e-01 -3.54003906e-02
 -1.58203125e-01 -1.60156250e-01  2.94189453e-02  8.15429688e-02
  6.88476562e-02  1.87500000e-01  6.49414062e-02  1.15234375e-01
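# [MEMO] A quick usage sketch for the loaded KeyedVectors via gensim's standard
#        query API; the example words are arbitrary choices, not taken from the
#        exercise. Not part of the original script.
from kiyuna.utils.pickle import load

wv = load("chap07-embeddings")
# Cosine similarity between two word vectors.
print(wv.similarity("United_States", "Spain"))
# The three nearest neighbours of "United_States" by cosine similarity.
print(wv.most_similar(positive=["United_States"], topn=3))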
""" import gzip import json import os import pprint import sys sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) from kiyuna.utils.message import Renderer, message # noqa: E402 isort:skip from kiyuna.utils.pickle import dump # noqa: E402 isort:skip def extract_wiki(path: str, title: str) -> dict: with gzip.open(path, "rt") as f: for line in f: d = json.loads(line) if d["title"] == title: return d if __name__ == "__main__": input_path, title, out_fname, *_ = sys.argv[1:] article = extract_wiki(input_path, title) dump(article["text"], out_fname) with Renderer("MEMO") as out: out.result("article.keys()", article.keys()) # pprint.pprint(article)