Example #1
0
def sampling_train(total_num=10000):
    """Promote candidate cases (flag 12) into the sampled set (flag 2).

    Iterates over ``cases`` documents whose ``flag`` is 12. Every case whose
    ``ygscWords2`` splits into more than 10 and fewer than 30 space-separated
    tokens gets ``flag`` set to 2, and ``sampleTrainCount`` is incremented on
    each statute listed in its ``ftids``.

    :param total_num: stop as soon as this many cases have been promoted.
    :return: None
    """
    logger = myutil.getLogger("sample.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    num = 0

    cursor = cases_set.find({"flag": 12},
                            no_cursor_timeout=True).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            ygsc_words_2 = line["ygscWords2"].split(" ")

            if 10 < len(ygsc_words_2) < 30:
                num += 1
                # update_one touches only the matched document and never
                # inserts (same semantics as upsert=False, multi=False).
                cases_set.update_one(
                    {"_id": line["_id"]},
                    {"$set": {"flag": 2}},
                    upsert=False,
                )
                for ftid in line["ftids"]:
                    statutes_set.update_one(
                        {"_id": ftid},
                        {"$inc": {"sampleTrainCount": 1}},
                        upsert=False,
                    )
            if num == total_num:
                break
    finally:
        # A no_cursor_timeout cursor is never reaped by the server, so it
        # must be closed explicitly, in particular after the early break.
        cursor.close()
Example #2
0
def case_fenci_second_patch():
    """Move patched flag-10 cases back to flag 2 when their length fits.

    Iterates over ``cases`` documents with ``flag`` 10 that carry a ``patch``
    field. When ``ygscWords2`` splits into more than 3 and at most 80
    space-separated tokens, the case's ``flag`` is set to 2 and ``trainCount``
    is incremented on each statute listed in its ``ftids``.

    :return: None
    """
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    cursor = cases_set.find(
        {"flag": 10, "patch": {"$exists": True}},
        no_cursor_timeout=True,
    ).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            ygsc_words_2 = line["ygscWords2"].split(" ")

            if not 3 < len(ygsc_words_2) <= 80:
                continue

            # update_one touches only the matched document and never inserts.
            cases_set.update_one(
                {"_id": line["_id"]},
                {"$set": {"flag": 2}},
                upsert=False,
            )
            for ftid in line["ftids"]:
                statutes_set.update_one(
                    {"_id": ftid},
                    {"$inc": {"trainCount": 1}},
                    upsert=False,
                )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()
Example #3
0
def prepareLabels(flag=2):
    """Write a multi-hot statute ``label`` onto every case with ``flag``.

    For ``flag == 2`` the statute-to-index dictionary starts empty, indices
    are assigned in order of first appearance, and the dictionary is saved to
    ``checkpoint/statute_dict.pk`` at the end. For any other value the
    dictionary is loaded from that file; a case citing a statute that is not
    in it is logged as an error and left without a label.

    :param flag: value of the ``flag`` field selecting the cases to label.
    :return: None
    """
    logger = myutil.getLogger("label.log")
    statute_dict = {}
    statute_index = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases

    if flag == 2:
        statutes_set = db.statutes
        # Label width = number of statutes that have a sampleTrainCount.
        statute_num = statutes_set.count(
            {"sampleTrainCount": {
                "$exists": True
            }})
    else:
        with open("checkpoint/statute_dict.pk", "rb") as file:
            statute_dict = joblib.load(file)
        statute_num = len(statute_dict)

    cursor = cases_set.find(
        {"flag": flag},
        {"ftids": 1},
        no_cursor_timeout=True,
    ).batch_size(20)
    try:
        for line in cursor:
            logger.info(line["_id"])
            label = [0] * statute_num
            legal = True
            for ftid in line["ftids"]:
                if ftid in statute_dict:
                    label[statute_dict[ftid]] = 1
                elif flag == 2:
                    # First time this statute is seen: give it the next index.
                    # NOTE(review): raises IndexError if the flagged cases
                    # cite more distinct statutes than statute_num -- confirm
                    # the count above always covers them.
                    statute_dict[ftid] = statute_index
                    label[statute_index] = 1
                    statute_index += 1
                else:
                    logger.error("出现不在训练集的法条:%s" % line["_id"])
                    legal = False
                    break

            if legal:
                # update_one touches only the matched document, no insert.
                cases_set.update_one(
                    {"_id": line["_id"]},
                    {"$set": {"label": label}},
                    upsert=False,
                )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()

    # Persist the dictionary that was built from the flag-2 cases.
    if flag == 2:
        with open("checkpoint/statute_dict.pk", "wb") as file:
            joblib.dump(statute_dict, file)
Example #4
0
def case_fenci_second():
    """Second-pass word filtering for cases that have no ``ygscWords2`` yet.

    For every case with a non-zero ``flag`` and no ``ygscWords2`` field:

    1. Keep a word from ``ygscWords`` only if it exists in the ``words``
       collection, passes ``__not_stopwords``, and does not already occur
       among the last five kept words.
    2. If fewer than 3 or more than 80 words remain, set ``flag`` to 10.
       Otherwise, when the case's flag is 2, increment ``trainCount`` on each
       statute listed in its ``ftids``.
    3. Store the filtered words (space-joined) and the resulting flag.

    :return: None
    """
    logger = myutil.getLogger("fenci.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    words_set = db.words

    cursor = cases_set.find(
        {"flag": {"$ne": 0}, "ygscWords2": {"$exists": False}},
        no_cursor_timeout=True,
    ).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            flag = line["flag"]
            ygsc_words_2 = []

            # 1: word filtering
            for word in line["ygscWords"].split(" "):
                # 1.1: the word must be known to the words collection and
                # must pass the stop-word check.
                word_db = words_set.find_one({"_id": word})
                if word_db is None or not __not_stopwords(word_db):
                    continue
                # 1.2: skip it if it repeats one of the last five kept words.
                if word not in ygsc_words_2[-5:]:
                    ygsc_words_2.append(word)

            # 2: cases whose filtered length is outside [3, 80] get flag 10;
            # otherwise flag-2 cases contribute to the statute citation count.
            if len(ygsc_words_2) < 3 or len(ygsc_words_2) > 80:
                flag = 10
            elif flag == 2:
                for ftid in line["ftids"]:
                    statutes_set.update_one(
                        {"_id": ftid},
                        {"$inc": {"trainCount": 1}},
                        upsert=False,
                    )

            # 3: store the result; update_one never inserts a new document.
            cases_set.update_one(
                {"_id": line["_id"]},
                {"$set": {
                    "ygscWords2": " ".join(ygsc_words_2),
                    "flag": flag
                }},
                upsert=False,
            )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()
Example #5
0
def clean_by_statutes():
    """Flag cases citing unknown statutes and count statute usage.

    Processes every case whose ``flag`` is non-zero and below 5 and that has
    no ``clean`` field yet:

    * if any id in ``ftids`` is missing from ``get_statutes_set()``, the flag
      becomes 9;
    * otherwise, for flag 3 the ``testCount`` and for flag 4 the
      ``validCount`` of each cited statute is incremented;
    * finally the (possibly changed) flag and ``clean: 0`` are written back,
      which marks the case as processed.

    :return: None
    """
    logger = myutil.getLogger("clean.log")
    statutes_list = get_statutes_set()
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    cursor = cases_set.find(
        {"flag": {"$ne": 0, "$lt": 5}, "clean": {"$exists": False}},
        no_cursor_timeout=True,
    ).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            flag = line["flag"]

            if any(ftid not in statutes_list for ftid in line["ftids"]):
                flag = 9

            if flag in (3, 4):
                # Per the original comments: 3 = test set, 4 = validation set.
                col = "testCount" if flag == 3 else "validCount"
                for ftid in line["ftids"]:
                    statutes_set.update_one(
                        {"_id": ftid},
                        {"$inc": {col: 1}},
                        upsert=False,
                    )

            # update_one touches only the matched document, no insert.
            cases_set.update_one(
                {"_id": line["_id"]},
                {"$set": {
                    "flag": flag,
                    "clean": 0
                }},
                upsert=False,
            )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()
Example #6
0
def top_k_test(type="annoy"):
    """Benchmark 100-nearest-neighbour queries against a saved index.

    Loads the index saved under ``checkpoint/`` for the requested kind, runs
    100000 queries with random 300-dimensional vectors drawn uniformly from
    [-1, 1), and logs the maximum, minimum and average time per query.

    :param type: ``"annoy"``, ``"kd_tree"`` or ``"ball_tree"``. (The name
        shadows the builtin but is kept for caller compatibility.)
    :raises ValueError: if ``type`` is not one of the supported kinds.
    :return: None
    """
    # Fail fast: otherwise an unknown kind only surfaces as an unbound
    # ``model`` inside the timing loop.
    if type not in ("annoy", "kd_tree", "ball_tree"):
        raise ValueError("unknown index type: %r" % (type,))

    logger = myutil.getLogger("try.log")

    if type == "annoy":
        model = AnnoyIndex(300)
        model.load('checkpoint/annoy.pk')
    else:
        # Resolves to checkpoint/kd_tree.pk or checkpoint/ball_tree.pk.
        with open("checkpoint/%s.pk" % type, "rb") as file:
            model = joblib.load(file)

    max_num = 0.0
    min_num = float("inf")
    sum_num = 0.0
    count = 0
    for _ in range(100000):
        vec = numpy.random.uniform(-1, 1, size=300)
        # perf_counter is monotonic and high-resolution, unlike time.time.
        start = time.perf_counter()
        if type == "annoy":
            # Query by vector (get_nns_by_item would query by item index).
            model.get_nns_by_vector(vec, 100, include_distances=True)
        else:
            model.query([vec], k=100)
        run_time = time.perf_counter() - start

        # Update the running statistics.
        sum_num += run_time
        count += 1
        max_num = max(max_num, run_time)
        min_num = min(min_num, run_time)

    logger.info("%s, max: %f, min: %f, avg: %f, count: %f" %
                (type, max_num, min_num, (sum_num / count), count))
Example #7
0
def top_k_train():
    """Build and save three nearest-neighbour indexes over the embeddings.

    Loads the word embedding via ``load_word_embedding()``, then builds and
    saves an Annoy index (``checkpoint/annoy.pk``), a ``BallTree``
    (``checkpoint/ball_tree.pk``) and a ``KDTree`` (``checkpoint/kd_tree.pk``).

    :return: None
    """
    log = myutil.getLogger("try.log")

    embedding = load_word_embedding()
    log.info("vector size: %d" % len(embedding))

    index = AnnoyIndex(300)
    for item_id, vector in enumerate(embedding):
        index.add_item(item_id, vector)
    index.build(50)  # 50 trees; more trees give more accurate results
    index.save('checkpoint/annoy.pk')

    # The two tree indexes are built and dumped the same way, one at a time.
    tree_targets = (
        (BallTree, "checkpoint/ball_tree.pk"),
        (KDTree, "checkpoint/kd_tree.pk"),
    )
    for tree_cls, path in tree_targets:
        tree = tree_cls(embedding)
        with open(path, "wb") as out:
            joblib.dump(tree, out)
Example #8
0
def case_fenci_second_patch_test():
    """Move flag-10 cases to flag 4 when their filtered length fits.

    Iterates over ``cases`` documents with ``flag`` 10 and sets ``flag`` to 4
    on those whose ``ygscWords2`` splits into more than 3 and at most 80
    space-separated tokens.

    :return: None
    """
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases

    cursor = cases_set.find({"flag": 10},
                            no_cursor_timeout=True).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            ygsc_words_2 = line["ygscWords2"].split(" ")

            if 3 < len(ygsc_words_2) <= 80:
                # update_one touches only the matched document, no insert.
                cases_set.update_one(
                    {"_id": line["_id"]},
                    {"$set": {"flag": 4}},
                    upsert=False,
                )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()
Example #9
0
def sampling_test(total_num=1000):
    """Promote candidate cases (flag 14) to flag 4.

    Iterates over ``cases`` documents whose ``flag`` is 14. A case is promoted
    (``flag`` set to 4) when ``ygscWords2`` splits into more than 10 and fewer
    than 30 space-separated tokens and every statute in its ``ftids`` exists
    in ``statutes`` with a ``sampleTrainCount`` field.

    :param total_num: stop as soon as this many cases have been promoted.
    :return: None
    """
    logger = myutil.getLogger("sample_test.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    num = 0

    cursor = cases_set.find({"flag": 14},
                            no_cursor_timeout=True).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            ygsc_words_2 = line["ygscWords2"].split(" ")

            if 10 < len(ygsc_words_2) < 30:
                # all() stops at the first statute that fails the lookup,
                # exactly like the former flag-and-break loop.
                ftlegal = all(
                    statutes_set.find_one({
                        "_id": ftid,
                        "sampleTrainCount": {"$exists": True},
                    }) is not None
                    for ftid in line["ftids"]
                )
                if ftlegal:
                    num += 1
                    # update_one touches only the matched document.
                    cases_set.update_one(
                        {"_id": line["_id"]},
                        {"$set": {"flag": 4}},
                        upsert=False,
                    )

            if num == total_num:
                break
    finally:
        # A no_cursor_timeout cursor is never reaped by the server, so it
        # must be closed explicitly, in particular after the early break.
        cursor.close()
Example #10
0
def trainWordvector(genFile=False, corpusFile="checkpoint/corpus.txt"):
    """Train word vectors on the corpus file and save vocabulary + vectors.

    :param genFile: when true, (re)write ``corpusFile`` first from the XML
        files under ``config["corpus_dir"]`` (the ``value`` attribute of the
        first ``AJJBQK`` element of each file) followed by the ``content`` of
        every document in the ``statutes`` collection.
    :param corpusFile: path of the corpus text file to write and/or read.
    :return: None
    """
    config = myutil.read_config("conf/fttj.conf")
    logger = myutil.getLogger("parsexml.log")

    # Step 1: generate the corpus file on request.
    if genFile:
        with open(corpusFile, 'w', encoding="utf-8") as corpus:
            # Case documents.
            corpus_dir = config["corpus_dir"]
            for xml_name in os.listdir(corpus_dir):
                try:
                    document = xml.dom.minidom.parse(corpus_dir + '/' + xml_name)
                    matches = document.documentElement.getElementsByTagName(
                        "AJJBQK")
                    if len(matches) > 0:
                        __appendToFile(matches[0].getAttribute("value"), corpus)
                except xml.parsers.expat.ExpatError:
                    logger.error("%s编码错误" % xml_name)

            # Statutes.
            statutes = dbutil.get_mongodb_conn().statutes
            for statute in statutes.find():
                __appendToFile(statute["content"], corpus)

    # Step 2: train the word vectors.
    w2v = word2vec.Word2Vec(word2vec.LineSentence(corpusFile),
                            min_count=50,
                            size=config["embedding_size"])
    vocabulary, word_embedding = __get_word_vector(w2v.wv)

    # Step 3: save the vocabulary and the embedding matrix.
    outputs = (
        ("checkpoint/vocabulary.pk", vocabulary),
        ("checkpoint/wordvector.pk", word_embedding),
    )
    for path, payload in outputs:
        with open(path, "wb") as out:
            joblib.dump(payload, out)
Example #11
0
def case_fenci_patch():
    """Add the word counts of unpatched flag-2 cases to ``words``.

    For every case with ``flag`` 2 and no ``patch`` field, each distinct word
    of ``ygscWords`` has its ``totalCount`` and its per-``aydm`` counter
    ``ayCount.<aydm>`` incremented by one in the ``words`` collection (the
    word document is created if missing). The case is then marked with
    ``patch: 0`` so that it is not counted again.

    :return: None
    """
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    words_set = db.words

    cursor = cases_set.find(
        {"flag": 2, "patch": {"$exists": False}},
        no_cursor_timeout=True,
    ).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            ay_name = "ayCount." + str(line["aydm"])

            for word in set(line["ygscWords"].split(" ")):
                # One atomic upsert replaces the former find-then-insert /
                # find-then-$set sequence: $inc creates missing fields (and,
                # with upsert, the missing document) starting from 0, so the
                # stored result is the same without the read-modify-write
                # race and with half the round trips.
                words_set.update_one(
                    {"_id": word},
                    {"$inc": {"totalCount": 1, ay_name: 1}},
                    upsert=True,
                )

            # Mark the case as patched; update_one never inserts here.
            cases_set.update_one(
                {"_id": line["_id"]},
                {"$set": {"patch": 0}},
                upsert=False,
            )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()
Example #12
0
def case_fenci_first():
    """First-pass tokenisation and random split of flag-1 cases.

    For every case with ``flag`` 1:

    * ``ygsc`` is tokenised with ``fenci``; if fewer than 20 words result,
      the words are stored and ``flag`` is set to 0.
    * Otherwise the raw ``psg.cut`` tokens are stored as ``ygscWordsOrigin``
      and the case is randomly assigned flag 2 (probability 0.5), flag 3
      (0.4) or flag 4 (0.1). For flag-2 cases each distinct word has its
      ``totalCount`` and per-``aydm`` counter ``ayCount.<aydm>`` incremented
      in the ``words`` collection (the word document is created if missing).

    :return: None
    """
    logger = myutil.getLogger("fenci.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    words_set = db.words

    cursor = cases_set.find({"flag": 1},
                            no_cursor_timeout=True).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # log the _id of the case being handled
            ygsc_words = fenci(line["ygsc"])  # tokens after preprocessing

            # 1: cases with too few words are stored with flag 0.
            if len(ygsc_words) < 20:
                cases_set.update_one(
                    {"_id": line["_id"]},
                    {"$set": {
                        "ygscWords": " ".join(ygsc_words),
                        "flag": 0
                    }},
                    upsert=False,
                )
                continue

            # Unprocessed tokens: keep only the word of each (word, tag) pair.
            ygsc_words_ori = [w for (w, _tag) in psg.cut(line["ygsc"])]

            r = random.random()
            if r < 0.5:  # train
                flag = 2
                # 2: training cases feed the per-word counts (the original
                # comment describes them as input to a word information-
                # entropy computation). One atomic upsert replaces the former
                # find-then-insert / find-then-$set sequence: $inc creates
                # missing fields and, with upsert, the missing document.
                ay_name = "ayCount." + str(line["aydm"])
                for word in set(ygsc_words):
                    words_set.update_one(
                        {"_id": word},
                        {"$inc": {"totalCount": 1, ay_name: 1}},
                        upsert=True,
                    )
            elif r < 0.9:  # test
                flag = 3
            else:  # trial
                flag = 4

            # update_one touches only the matched document, no insert.
            cases_set.update_one(
                {"_id": line["_id"]},
                {"$set": {
                    "ygscWordsOrigin": " ".join(ygsc_words_ori),
                    "ygscWords": " ".join(ygsc_words),
                    "flag": flag
                }},
                upsert=False,
            )
    finally:
        # A no_cursor_timeout cursor must be closed explicitly.
        cursor.close()
Example #13
0
def __load_logger(logfile="test.log"):
    """Create the module-level ``logger`` if it has not been created yet.

    :param logfile: log file name passed to ``myutil.getLogger``.
    """
    global logger
    # Only build the logger while the module-level name is still None.
    if logger is None:
        logger = myutil.getLogger(logfile)