import os
import random
import time
import xml.dom.minidom
import xml.parsers.expat

import joblib
import numpy
import jieba.posseg as psg
from annoy import AnnoyIndex
from gensim.models import word2vec
from sklearn.neighbors import BallTree, KDTree

import dbutil
import myutil

logger = None  # module-level logger, lazily initialised by __load_logger()


def sampling_train(total_num=10000):
    logger = myutil.getLogger("sample.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    num = 0
    for line in cases_set.find({"flag": 12},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 10 < len(ygsc_words_2) < 30:
            num += 1
            cases_set.update_one(
                {"_id": line["_id"]},   # update filter
                {"$set": {"flag": 2}},  # update content
                upsert=False,           # do not insert when nothing matches
            )
            for ftid in line["ftids"]:
                statutes_set.update_one(
                    {"_id": ftid},
                    {"$inc": {"sampleTrainCount": 1}},
                    upsert=False,
                )
            if num == total_num:
                break

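# Flag legend, inferred from how this module reads and writes "flag" (an
# assumption, not a documented convention): 0 = discarded, 1 = raw case,
# 2 = training set, 3 = test set, 4 = validation set, 9 = cites a statute
# outside the statute list, 10 = processed word count out of range,
# 12 / 14 = candidate pools for sampling, produced elsewhere.
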
def case_fenci_second_patch():
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    for line in cases_set.find({"flag": 10, "patch": {"$exists": True}},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 3 < len(ygsc_words_2) <= 80:
            cases_set.update_one(
                {"_id": line["_id"]},   # update filter
                {"$set": {"flag": 2}},  # update content
                upsert=False,           # do not insert when nothing matches
            )
            for ftid in line["ftids"]:
                statutes_set.update_one(
                    {"_id": ftid},
                    {"$inc": {"trainCount": 1}},
                    upsert=False,
                )

def prepareLabels(flag=2):
    """
    Prepare the labels for the training set (flag=2) or the validation set.
    :param flag: case flag selecting the dataset
    :return:
    """
    logger = myutil.getLogger("label.log")
    statute_dict = {}
    statute_index = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    if flag == 2:
        statutes_set = db.statutes
        statute_num = statutes_set.count_documents(
            {"sampleTrainCount": {"$exists": True}})
    else:
        with open("checkpoint/statute_dict.pk", "rb") as file:
            statute_dict = joblib.load(file)
        statute_num = len(statute_dict)
    for line in cases_set.find({"flag": flag}, {"ftids": 1},
                               no_cursor_timeout=True).batch_size(20):
        logger.info(line["_id"])
        label = [0] * statute_num
        legal = True
        for ftid in line["ftids"]:
            if ftid in statute_dict:
                label[statute_dict[ftid]] = 1  # statute already indexed
            else:
                if flag == 2:
                    statute_dict[ftid] = statute_index  # register an unseen statute
                    label[statute_index] = 1
                    statute_index += 1  # advance the running index
                else:
                    logger.error("statute not in the training set: %s" % line["_id"])
                    legal = False
                    break
        if legal:
            cases_set.update_one(
                {"_id": line["_id"]},        # update filter
                {"$set": {"label": label}},  # update content
                upsert=False,                # do not insert when nothing matches
            )
    # persist the statute dict
    if flag == 2:
        with open("checkpoint/statute_dict.pk", "wb") as file:
            joblib.dump(statute_dict, file)

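# A hedged sketch of how the labels written by prepareLabels() might be read
# back for training: reload the persisted statute_dict to recover the label
# dimension, then stack the per-case label vectors into a numpy matrix. The
# function name and the projection are illustrative assumptions.
def load_labels_sketch(flag=2):
    with open("checkpoint/statute_dict.pk", "rb") as file:
        statute_dict = joblib.load(file)
    db = dbutil.get_mongodb_conn()
    labels = [line["label"]
              for line in db.cases.find({"flag": flag,
                                         "label": {"$exists": True}},
                                        {"label": 1},
                                        no_cursor_timeout=True).batch_size(100)]
    return numpy.array(labels), len(statute_dict)
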
def case_fenci_second():
    logger = myutil.getLogger("fenci.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    words_set = db.words
    for line in cases_set.find({"flag": {"$ne": 0},
                                "ygscWords2": {"$exists": False}},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        flag = line["flag"]
        ygsc_words = line["ygscWords"].split(" ")
        ygsc_words_2 = []
        # 1: filter the words
        for word in ygsc_words:
            # 1.1: keep only non-stopwords and non-rare words; for non-training
            #      sets this also drops words never seen during training
            word_db = words_set.find_one({"_id": word})
            if word_db is not None and __not_stopwords(word_db):
                # 1.2: skip a word repeated within the previous five words
                found = False
                end = len(ygsc_words_2)
                start = max(0, end - 5)
                for i in range(start, end):
                    if ygsc_words_2[i] == word:
                        found = True
                        break
                if not found:
                    ygsc_words_2.append(word)
        # 2: flag cases whose processed word list is out of range (short lists
        #    actually work well: no person or place names left)
        if len(ygsc_words_2) < 3 or len(ygsc_words_2) > 80:
            flag = 10
        elif flag == 2:
            # otherwise, for the training set, count statute citations
            for ftid in line["ftids"]:
                statutes_set.update_one(
                    {"_id": ftid},
                    {"$inc": {"trainCount": 1}},
                    upsert=False,
                )
        cases_set.update_one(
            {"_id": line["_id"]},  # update filter
            {"$set": {"ygscWords2": " ".join(ygsc_words_2),
                      "flag": flag}},  # update content
            upsert=False,              # do not insert when nothing matches
        )

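# __not_stopwords() is called above but defined elsewhere in the project.
# A minimal sketch of one plausible shape, assuming each word document marks
# stopwords explicitly and that rare words are dropped by "totalCount"; the
# decision fields and the threshold are assumptions, not the actual rule.
def __not_stopwords_sketch(word_db, min_count=50):
    if word_db.get("stop", 0) == 1:  # hypothetical explicit stopword marker
        return False
    return word_db.get("totalCount", 0) >= min_count  # drop rare words
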
def clean_by_statutes():
    logger = myutil.getLogger("clean.log")
    statutes_list = get_statutes_set()
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    for line in cases_set.find({"flag": {"$ne": 0, "$lt": 5},
                                "clean": {"$exists": False}},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        flag = line["flag"]
        for ftid in line["ftids"]:
            if ftid not in statutes_list:
                flag = 9
                break
        if flag == 3 or flag == 4:
            # accumulate statute counts for the test and validation sets
            if flag == 3:  # test set
                col = "testCount"
            else:
                col = "validCount"
            for ftid in line["ftids"]:
                statutes_set.update_one(
                    {"_id": ftid},
                    {"$inc": {col: 1}},
                    upsert=False,
                )
        cases_set.update_one(
            {"_id": line["_id"]},                  # update filter
            {"$set": {"flag": flag, "clean": 0}},  # update content
            upsert=False,                          # do not insert when nothing matches
        )

def top_k_test(type="annoy"):
    logger = myutil.getLogger("try.log")
    if type == "annoy":
        model = AnnoyIndex(300)
        model.load('checkpoint/annoy.pk')
    elif type == "kd_tree":
        with open("checkpoint/kd_tree.pk", "rb") as file:
            model = joblib.load(file)
    elif type == "ball_tree":
        with open("checkpoint/ball_tree.pk", "rb") as file:
            model = joblib.load(file)
    max_num = 0.0
    min_num = 1000.0
    sum_num = 0.0
    count = 0
    for i in range(100000):
        vec = numpy.random.uniform(-1, 1, size=300)
        start = time.time()
        if type == "annoy":
            # query by item index: get_nns_by_item; query by vector: get_nns_by_vector
            words, dis = model.get_nns_by_vector(vec, 100, include_distances=True)
            # for id in words:
            #     print(id)
        else:
            dis, ind = model.query([vec], k=100)
            # for j in range(len(ind[0])):
            #     print(ind[0][j], dis[0][j])
        stop = time.time()
        # update the running statistics
        run_time = float(stop - start)
        sum_num += run_time
        count += 1
        if run_time > max_num:
            max_num = run_time
        if run_time < min_num:
            min_num = run_time
    logger.info("%s, max: %f, min: %f, avg: %f, count: %d" %
                (type, max_num, min_num, (sum_num / count), count))

def top_k_train():
    logger = myutil.getLogger("try.log")
    emd = load_word_embedding()
    logger.info("vector size: %d" % len(emd))
    annoy_model = AnnoyIndex(300)
    for (i, vec) in enumerate(emd):
        annoy_model.add_item(i, vec)
    annoy_model.build(50)  # build 50 trees: more trees, more accurate queries
    annoy_model.save('checkpoint/annoy.pk')
    ball_tree = BallTree(emd)
    with open("checkpoint/ball_tree.pk", "wb") as file:
        joblib.dump(ball_tree, file)
    kd_tree = KDTree(emd)
    with open("checkpoint/kd_tree.pk", "wb") as file:
        joblib.dump(kd_tree, file)

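# load_word_embedding() is referenced above but lives elsewhere. A minimal
# sketch, assuming it simply reloads the matrix persisted by trainWordvector():
def load_word_embedding_sketch():
    with open("checkpoint/wordvector.pk", "rb") as file:
        return joblib.load(file)

# Hedged usage: build the three indexes once, then benchmark each backend.
# if __name__ == "__main__":
#     top_k_train()
#     for index_type in ("annoy", "kd_tree", "ball_tree"):
#         top_k_test(index_type)
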
def case_fenci_second_patch_test():
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({"flag": 10},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 3 < len(ygsc_words_2) <= 80:
            cases_set.update_one(
                {"_id": line["_id"]},   # update filter
                {"$set": {"flag": 4}},  # update content
                upsert=False,           # do not insert when nothing matches
            )

def sampling_test(total_num=1000):
    logger = myutil.getLogger("sample_test.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    num = 0
    for line in cases_set.find({"flag": 14},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 10 < len(ygsc_words_2) < 30:
            # keep only cases whose cited statutes all appear in the sampled training set
            ftlegal = True
            for ftid in line["ftids"]:
                statute_db = statutes_set.find_one(
                    {"_id": ftid, "sampleTrainCount": {"$exists": True}})
                if statute_db is None:
                    ftlegal = False
                    break
            if ftlegal:
                num += 1
                cases_set.update_one(
                    {"_id": line["_id"]},   # update filter
                    {"$set": {"flag": 4}},  # update content
                    upsert=False,           # do not insert when nothing matches
                )
                if num == total_num:
                    break

def trainWordvector(genFile=False, corpusFile="checkpoint/corpus.txt"):
    config = myutil.read_config("conf/fttj.conf")
    logger = myutil.getLogger("parsexml.log")
    # 1: generate the corpus file
    if genFile:
        with open(corpusFile, 'w', encoding="utf-8") as corpus:
            # cases
            corpus_dir = config["corpus_dir"]
            for file in os.listdir(corpus_dir):
                try:
                    dom = xml.dom.minidom.parse(corpus_dir + '/' + file)
                    nodelist = dom.documentElement.getElementsByTagName("AJJBQK")
                    if len(nodelist) > 0:
                        text = nodelist[0].getAttribute("value")
                        __appendToFile(text, corpus)
                except xml.parsers.expat.ExpatError:
                    logger.error("encoding error in %s" % file)
            # statutes
            db = dbutil.get_mongodb_conn()
            statutes_set = db.statutes
            for line in statutes_set.find():
                __appendToFile(line["content"], corpus)
    # 2: train the word vectors
    sentences = word2vec.LineSentence(corpusFile)
    model = word2vec.Word2Vec(sentences, min_count=50,
                              size=config["embedding_size"])
    vocabulary, word_embedding = __get_word_vector(model.wv)
    # 3: save the model
    with open("checkpoint/vocabulary.pk", "wb") as file:
        joblib.dump(vocabulary, file)
    with open("checkpoint/wordvector.pk", "wb") as file:
        joblib.dump(word_embedding, file)

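# trainWordvector() relies on two private helpers that are not defined in
# this section. The sketches below show one plausible shape for each, as
# assumptions rather than the project's actual implementations:
# __appendToFile is assumed to tokenise a text with jieba and write it as a
# space-separated line, matching the LineSentence format consumed above;
# __get_word_vector is assumed to flatten a gensim KeyedVectors object into
# a {word: index} dict plus an embedding matrix (gensim 3.x attribute names).
def __appendToFile_sketch(text, corpus):
    words = [w for (w, pos) in psg.cut(text)]  # tokenise
    corpus.write(" ".join(words) + "\n")


def __get_word_vector_sketch(wv):
    vocabulary = {}
    word_embedding = []
    for i, word in enumerate(wv.index2word):  # gensim 3.x vocabulary order
        vocabulary[word] = i
        word_embedding.append(wv[word])
    return vocabulary, word_embedding
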
def case_fenci_patch():
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    words_set = db.words
    # for line in cases_set.find({"flag": {"$ne": 0}}, no_cursor_timeout=True).batch_size(10):
    #     # tokenisation result before preprocessing
    #     words = psg.cut(line["ygsc"])
    #     ygsc_words_ori = []
    #     for (w, flag) in words:
    #         ygsc_words_ori.append(w)
    #     cases_set.update_one(
    #         {"_id": line["_id"]},                                     # update filter
    #         {'$set': {"ygscWordsOrigin": " ".join(ygsc_words_ori)}},  # update content
    #         upsert=False,
    #     )
    for line in cases_set.find({"flag": 2, "patch": {"$exists": False}},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words = line["ygscWords"].split(" ")  # tokens after preprocessing
        ygsc_words_set = set(ygsc_words)
        for word in ygsc_words_set:
            word_db = words_set.find_one({"_id": word})
            if word_db is None:
                # new word
                words_set.insert_one({
                    "_id": word,
                    "totalCount": 1,
                    "ayCount": {str(line["aydm"]): 1}
                })
            else:
                if str(line["aydm"]) in word_db["ayCount"]:
                    ay_count = word_db["ayCount"][str(line["aydm"])] + 1
                else:
                    # new cause of action
                    ay_count = 1
                ay_name = "ayCount." + str(line["aydm"])
                words_set.update_one(
                    {"_id": word},
                    {"$set": {ay_name: ay_count},
                     "$inc": {"totalCount": 1}},
                    upsert=False,
                )
        cases_set.update_one(
            {"_id": line["_id"]},    # update filter
            {"$set": {"patch": 0}},  # update content
            upsert=False,            # do not insert when nothing matches
        )

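# The per-cause counts maintained above (and in case_fenci_first below) look
# like the input to an information-entropy word filter. A minimal sketch of
# that computation, assuming the entropy is taken over a word's distribution
# across cause-of-action codes; the helper name is hypothetical.
def word_entropy_sketch(word_db):
    import math
    total = float(word_db["totalCount"])
    entropy = 0.0
    for count in word_db["ayCount"].values():
        p = count / total
        entropy -= p * math.log(p)
    return entropy
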
def case_fenci_first():
    logger = myutil.getLogger("fenci.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    words_set = db.words
    for line in cases_set.find({"flag": 1},
                               no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words = fenci(line["ygsc"])  # tokens after preprocessing
        # 1: discard cases shorter than the required word count
        if len(ygsc_words) < 20:
            flag = 0
            cases_set.update_one(
                {"_id": line["_id"]},  # update filter
                {"$set": {"ygscWords": " ".join(ygsc_words),
                          "flag": flag}},  # update content
                upsert=False,              # do not insert when nothing matches
            )
        else:
            # raw tokenisation without preprocessing
            words = psg.cut(line["ygsc"])
            ygsc_words_ori = []
            for (w, pos) in words:
                ygsc_words_ori.append(w)
            r = random.random()
            if r < 0.5:  # training set
                flag = 2
                # 2: accumulate per-cause word counts for the training set
                #    (input to the information-entropy word filter)
                ygsc_words_set = set(ygsc_words)
                for word in ygsc_words_set:
                    word_db = words_set.find_one({"_id": word})
                    if word_db is None:
                        # new word
                        words_set.insert_one({
                            "_id": word,
                            "totalCount": 1,
                            "ayCount": {str(line["aydm"]): 1}
                        })
                    else:
                        if str(line["aydm"]) in word_db["ayCount"]:
                            ay_count = word_db["ayCount"][str(line["aydm"])] + 1
                        else:
                            # new cause of action
                            ay_count = 1
                        ay_name = "ayCount." + str(line["aydm"])
                        words_set.update_one(
                            {"_id": word},
                            {"$set": {ay_name: ay_count},
                             "$inc": {"totalCount": 1}},
                            upsert=False,
                        )
            elif r < 0.9:  # test set
                flag = 3
            else:  # validation set
                flag = 4
            cases_set.update_one(
                {"_id": line["_id"]},  # update filter
                {"$set": {"ygscWordsOrigin": " ".join(ygsc_words_ori),
                          "ygscWords": " ".join(ygsc_words),
                          "flag": flag}},  # update content
                upsert=False,              # do not insert when nothing matches
            )

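# fenci() is called above but defined elsewhere. A minimal sketch, assuming
# it segments the plaintiff's statement with jieba.posseg and keeps only
# content-bearing parts of speech; the POS whitelist is an assumption, not
# the project's actual filter.
def fenci_sketch(text):
    keep = ("n", "v", "a")  # nouns, verbs, adjectives (and their subtags)
    return [w for (w, pos) in psg.cut(text) if pos[:1] in keep]
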
def __load_logger(logfile="test.log"):
    global logger
    if logger is None:
        logger = myutil.getLogger(logfile)

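# A hedged end-to-end driver sketching the apparent order of the pipeline,
# inferred from the flag values: the steps that produce flags 12 and 14 live
# outside this module, so the ordering here is an assumption, not a
# documented workflow.
# if __name__ == "__main__":
#     case_fenci_first()         # flag 1 -> 2/3/4 (or 0 when too short)
#     clean_by_statutes()        # drop cases citing out-of-scope statutes
#     trainWordvector(genFile=True)
#     case_fenci_second()        # stopword / rare-word / repeat filtering
#     sampling_train()           # flag 12 -> 2 (pool produced elsewhere)
#     sampling_test()            # flag 14 -> 4 (pool produced elsewhere)
#     prepareLabels(flag=2)      # training labels; builds statute_dict
#     prepareLabels(flag=4)      # validation labels; reuses statute_dict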