def train() -> None:
    """Fit a logistic-regression classifier on the stored data and persist it."""
    # Load the pre-extracted feature matrix and the target labels.
    feature_matrix = deserialize("features")
    target_labels = deserialize("labels")
    # Learn the logistic-regression weights (fit) and save the model.
    fitted_model = LogisticRegression().fit(feature_matrix, target_labels)
    serialize("model", fitted_model)
def main():
    """Cluster the country vectors with Ward's method and display the dendrogram."""
    # Load the country embedding matrix and the name -> row mapping.
    vectors = deserialize("country.matrix")
    country_index = deserialize("country.index")
    # Hierarchical clustering: Ward linkage over Euclidean distances.
    frame = pd.DataFrame(vectors, country_index.keys())
    linked = linkage(frame, method="ward", metric="euclidean")
    # Draw the dendrogram with country names as leaf labels.
    dendrogram(linked, labels=list(country_index.keys()), leaf_font_size=8)
    plt.show()
def main():
    """Run k-means (k=5) over the country vectors and print each country's cluster."""
    # Load the country embedding matrix and the name -> row mapping.
    vectors = deserialize("country.matrix")
    country_index = deserialize("country.index")
    # Fit a 5-cluster k-means model and read off the per-row assignments.
    fitted = KMeans(n_clusters=5).fit(vectors)
    for name, cluster in zip(country_index.keys(), fitted.labels_):
        print(f"{name.ljust(12)} : {cluster}")
def main() -> None:
    """Print gold label, predicted label and prediction confidence per sample."""
    # Load the trained model plus the features and gold labels it was built for.
    clf = deserialize("model")
    X = deserialize("features")
    gold = deserialize("labels")
    # Predicted class and per-class probability distribution for every sample.
    predictions = clf.predict(X)
    probabilities = clf.predict_proba(X)
    # One tab-separated line per sample: gold, predicted, max class probability.
    for expected, predicted, dist in zip(gold, predictions, probabilities):
        print(f"{expected}\t{predicted}\t{max(dist):.6f}")
def main():
    """Plot the precision-recall curve of the stored model on the stored data."""
    clf = deserialize("model")
    X = deserialize("features")
    y = deserialize("labels")
    # Probability assigned to the positive (+1) class for each sample.
    positive_probs = clf.predict_proba(X)[:, 1]
    # Compute the curve from gold labels and predicted positive-class scores.
    precision, recall, _thresholds = precision_recall_curve(y, positive_probs)
    plt.plot(recall, precision)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.show()
def main() -> None:
    """Classify the sentences in sample.txt with the stored model and print results."""
    # Load the trained model and the vocabulary it expects.
    clf = deserialize("model")
    vocab = deserialize("vocabs")
    # Extract sentences from sample.txt and vectorize them over the fixed vocabulary.
    # NOTE(review): fit_transform re-estimates IDF weights from sample.txt instead of
    # reusing training-time IDF — confirm this matches how the model was trained.
    sentences, _ = create_feature("./sample.txt")
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    feature = vectorizer.fit_transform(sentences).toarray()
    # predict -> most likely class; predict_proba -> per-class probabilities.
    for label, dist in zip(clf.predict(feature), clf.predict_proba(feature)):
        print(f"{int(label):>3} : {max(dist):.6}")
def main(n_clusters: int = 5) -> None:
    """Project the country vectors to 2-D with t-SNE, colour each point by its
    k-means cluster, and annotate it with the country name.

    Args:
        n_clusters: number of k-means clusters. Defaults to 5, the original
            hard-coded value, so existing callers are unaffected.
    """
    # Load the name -> row mapping and the embedding matrix.
    t_index = deserialize("country.index")
    matrix = np.array(deserialize("country.matrix"))
    # 2-D embedding for plotting; cluster assignment for colouring.
    t_sne = TSNE(perplexity=30, learning_rate=500).fit_transform(matrix)
    predicts = KMeans(n_clusters=n_clusters).fit_predict(matrix)
    fig, ax = plt.subplots()
    cmap = plt.get_cmap("Set1")
    # Normalise cluster ids into [0, 1] for the colormap. The original divided
    # by a hard-coded 4 (= 5 clusters - 1); derive it so n_clusters stays in sync.
    denom = max(n_clusters - 1, 1)
    for index, label in enumerate(t_index.keys()):
        cval = cmap(predicts[index] / denom)
        ax.scatter(t_sne[index, 0], t_sne[index, 1], marker=".", color=cval)
        ax.annotate(label, xy=(t_sne[index, 0], t_sne[index, 1]), color=cval)
    plt.show()
def main() -> None:
    """Report accuracy, precision, recall and F1 from 5-fold cross-validation."""
    clf = deserialize("model")
    X = deserialize("features")
    y = deserialize("labels")
    # Metrics to collect: accuracy, precision, recall, F1 score.
    eval_index = ["accuracy", "precision", "recall", "f1"]
    # Run 5-fold cross-validation, then average each metric over the folds.
    raw = cross_validate(clf, X, y, cv=5, scoring=eval_index)
    averaged = {metric: mean(fold_values) for metric, fold_values in raw.items()}
    print(f"正解率 : {averaged['test_accuracy']}")
    print(f"適合率 : {averaged['test_precision']}")
    print(f"再現率 : {averaged['test_recall']}")
    print(f"F1 : {averaged['test_f1']}")
def weight_rank() -> None:
    """Print the 10 most negative and 10 most positive model weights with names.

    Fix: the "worst 10" lines were missing the space between name and weight
    that the "top 10" lines had; both sections now share one format.
    """
    model = deserialize("model")
    names = deserialize("names")
    # Per-feature weights of the (binary) classifier.
    weights = model.coef_[0].tolist()
    # Pair each weight with its feature name and sort ascending by weight.
    ranked = sorted(zip(weights, names))
    print("\n# rank : worst 10")
    for weight, name in ranked[:10]:
        print(f"{name:<10} {weight:.6f}")
    print("\n# rank : top 10")
    # Walk the 10 largest weights in descending order.
    for weight, name in ranked[:-11:-1]:
        print(f"{name:<10} {weight:.6f}")
def main():
    """Extract the country-name vectors from the full word2vec matrix and persist
    them as country.matrix / country.index."""
    # Load the full word2vec matrix and the token -> row index mapping.
    matrix, t_index = deserialize("matrix"), deserialize("t_index")
    # Load the list of country names.
    with open("../data/country.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    countries = [d["name"] for d in data]
    new_matrix, new_index = [], {}
    for c in countries:
        # Skip countries absent from the word2vec vocabulary.
        # (idiom fix: was `if not c in t_index`)
        if c not in t_index:
            continue
        new_matrix.append(matrix[t_index[c]])
        # NOTE(review): this stores the row index of the ORIGINAL matrix, not the
        # position in new_matrix — downstream code appears to use only the keys,
        # but confirm before relying on the values.
        new_index[c] = t_index[c]
    serialize("country.matrix", new_matrix)
    serialize("country.index", new_index)
def main():
    """Run word-vector knocks 86-89 against the loaded word2vec matrix."""
    word2vec_filepath = "../data/w2v.txt"
    load_word2vec(word2vec_filepath)
    matrix, t_index = deserialize("matrix"), deserialize("t_index")
    # knock86: the vector for "United_States".
    u_s = matrix[t_index["United_States"]]
    # knock87: cosine similarity between "U.S" and "United_States".
    print(cosine_similarity(matrix[t_index["U.S"]], u_s))
    # knock88: ten nearest neighbours of "England" (index 0 is the word itself).
    for word, score in similar_list(matrix, t_index, matrix[t_index["England"]])[1:11]:
        print(f"{word.ljust(10)} : {score}")
    # knock89: analogy Spain - Madrid + Athens, top ten matches.
    for word, score in multi_vec(matrix, t_index, "Spain", "Madrid", "Athens")[1:11]:
        print(f"{word.ljust(10)} : {score}")
import numpy as np import sys, pathlib from knock90 import multi_vec from scipy.io import loadmat from tqdm import tqdm import faiss chap08 = pathlib.Path().parent / ".." / "chapter08" chap09 = pathlib.Path().parent / ".." / "chapter09" sys.path.extend([str(chap08), str(chap09)]) from knock72 import serialize, deserialize from knock87 import cosine_similarity from knock80 import file_reader ppmi, p_index = loadmat("knock85.matrix")["knock85.matrix"], deserialize( "p_index") p_keys = list(p_index.keys()) w2v, t_index = np.array(deserialize("matrix")), deserialize("t_index") t_keys = list(t_index.keys()) faiss_ppmi = faiss.IndexFlatIP(300) faiss_ppmi.add(np.ascontiguousarray(ppmi.astype("float32"))) faiss_w2v = faiss.IndexFlatIP(300) faiss_w2v.add(np.ascontiguousarray(w2v.astype("float32"))) with open("./results/faiss.ppmi.out.txt", "w") as ppmi_out, open("./results/faiss.w2v.out.txt", "w") as w2v_out: for line in open("./results/knock91.output.txt", "r", encoding="utf-8"):
import numpy as np import sys, pathlib from knock90 import multi_vec from scipy.io import loadmat from tqdm import tqdm chap08 = pathlib.Path().parent / ".." / "chapter08" chap09 = pathlib.Path().parent / ".." / "chapter09" sys.path.extend([str(chap08), str(chap09)]) from knock72 import serialize, deserialize from knock87 import cosine_similarity from knock80 import file_reader w2v, w2v_index = deserialize("matrix"), deserialize("t_index") ppmi, ppmi_index = loadmat("knock85.matrix")["knock85.matrix"], deserialize( "p_index") def apply_w2v(line: str, f) -> None: t1, t2, t3, *_ = line.rstrip().split() res = multi_vec(w2v, w2v_index, t1, t2, t3) if not res: print(line.rstrip(), "-", "-", file=f) else: word, sim = res[0] print(line.rstrip(), word, sim, file=f) def apply_ppmi(line: str, f) -> None: