def main():
    """Score a fixed set of sample names and dump the results.

    Loads the IDF table from ``{CASE_DIR}/output/idf.jsonl``, calls
    ``calc_score`` for each hard-coded name, writes one record per name to
    ``{CASE_DIR}/output/scores.jsonl`` and echoes each result to stdout.
    """
    import numpy as np

    # Limit the precision numpy uses when printing floats.
    np.set_printoptions(precision=6)

    # Other steps referenced here but currently disabled:
    # calc_idf(), load_idf(), gen_vectors().

    # Compute the scores for every sample name.
    idf_table = list(z.read_jsonline(f"{CASE_DIR}/output/idf.jsonl"))
    sample_names = [
        "紫金县瓦溪镇康辉药店",
        "瓦溪镇康辉药店",
        "紫金瓦溪镇康辉药店",
        "紫金县瓦溪镇康辉药房",
        "武汉市江岸区李永康西医内科诊所",
        "武汉市江岸区李永康诊所",
        "江岸区李永康诊所",
    ]
    output_path = f"{CASE_DIR}/output/scores.jsonl"
    with jsonlines.open(output_path, "w") as writer:
        for sample in sample_names:
            result = calc_score(sample, idf_table)
            writer.write({"name": sample, "scores": result})
            print(sample, result)
    print("exit")
def calc_vec(vector):
    """Rank the stored vectors by their ``cosine_distance`` to ``vector``.

    Reads every record from ``{CASE_DIR}/output/vectors.jsonl``, skips records
    whose ``name`` is empty, and pairs each remaining name with the value
    returned by ``cosine_distance`` rounded to 6 decimal places.

    Returns:
        A list of ``(name, score)`` tuples sorted by ascending score.
    """
    records = z.read_jsonline(f"{CASE_DIR}/output/vectors.jsonl")
    # The rounded value from cosine_distance is stored as-is; an earlier
    # variant converted it with (1.0 - dist) * 100.0 instead.
    ranked = [
        (record["name"], round(cosine_distance(vector, record["vector"]), 6))
        for record in z.pb(records, title="计算分数")
        if record["name"]
    ]
    ranked.sort(key=lambda pair: pair[1])
    return ranked
def test_read_jsonline(self):
    """Check that read_jsonline returns a generator yielding the expected records.

    The same two checks run against both fixture files under ``test/static``.
    """
    import os
    from collections.abc import Generator

    from zzpy import read_jsonline

    expected_records = [
        {"name": "Zero", "age": 31},
        {"name": "Flyoung", "age": 17},
    ]
    fixture_names = (
        "1.jsonl",  # fixture noted as ending without a trailing newline
        "2.jsonl",  # fixture noted as ending with a trailing newline
    )
    for fixture in fixture_names:
        file_path = os.path.join("test", "static", fixture)
        # The call itself must return a generator object.
        self.assertIsInstance(read_jsonline(file_path), Generator)
        # Consuming the generator must yield exactly the expected records.
        self.assertListEqual(list(read_jsonline(file_path)), expected_records)
def gen_vector(name, slices=None, idf=None):
    """Build the weight vector of ``name`` over the IDF vocabulary.

    Args:
        name: The name to vectorize. Only used when ``slices`` is empty.
        slices: Words that make up ``name``. When empty or ``None`` they are
            derived via ``split_name(name)`` (first element of each returned
            item) and the split is printed.
        idf: Sequence of mappings with ``"word"`` and ``"weight"`` keys. When
            empty or ``None`` it is loaded from
            ``{CASE_DIR}/output/idf.jsonl``.

    Returns:
        A list with one entry per ``idf`` item, in ``idf`` order: the item's
        weight if its word occurs in ``slices``, otherwise ``0``.
    """
    # Defaults are None rather than [] so no mutable object is shared between
    # calls; any falsy value still means "not provided", as before.
    if not slices:
        slices = split_name(name)
        print(f"{name}: {slices}")
        slices = [s[0] for s in slices]
    if not idf:
        idf = list(z.read_jsonline(f"{CASE_DIR}/output/idf.jsonl"))

    vector = []
    for item in idf:
        word, weight = item["word"], item["weight"]
        vector.append(weight if word in slices else 0)
    return vector
def gen_vectors():
    """Generate vectors.

    Loads the IDF table from ``{CASE_DIR}/output/idf.jsonl``, then for every
    record in ``{CASE_DIR}/output/std.jsonl`` builds a vector with
    ``gen_vector`` and writes ``{"name", "vector"}`` to
    ``{CASE_DIR}/output/vectors.jsonl``.
    """
    idf_table = list(z.read_jsonline(f"{CASE_DIR}/output/idf.jsonl"))
    with jsonlines.open(f"{CASE_DIR}/output/vectors.jsonl", "w") as writer:
        for record in z.read_jsonline_with_progressbar(
                f"{CASE_DIR}/output/std.jsonl", title="生成向量"):
            record_name = record.get("name", "")
            # Each "result" entry is unpacked as a pair; only its first
            # element is kept.
            words = []
            for word, _ in record.get("result", []):
                words.append(word)
            writer.write({
                "name": record_name,
                "vector": gen_vector(name=record_name, slices=words, idf=idf_table),
            })
def load_matchers(path):
    """Return a list with one ``load_matcher`` result per record read from ``path``.

    Records are read with ``zzpy.read_jsonline`` and passed to
    ``load_matcher`` in file order.
    """
    import zzpy as z

    matchers = []
    for config in z.read_jsonline(path):
        matchers.append(load_matcher(config))
    return matchers