예제 #1
0
def fetch_all_content_result(document_list):
    """Segment every document into parallel (content, result) word lists.

    Parameters
    ----------
    document_list : iterable
        Documents accepted by ``content_result``.

    Returns
    -------
    tuple(list, list)
        ``(content_all_list, result_all_list)`` — the segmented content and
        result for each document, in input order.
    """
    print("in fetch")
    content_all_list = []
    result_all_list = []
    myseg = MySegment()
    try:
        for document in document_list:
            content, result = content_result(myseg, document)
            content_all_list.append(content)
            result_all_list.append(result)
    finally:
        # Fix: release the segmenter even if content_result raises
        # (original only closed on the success path).
        myseg.close()
    # Progress report: number of documents processed
    # (replaces the original manual counter).
    print(len(content_all_list))
    return content_all_list, result_all_list
예제 #2
0
def seg_document(document_list):
    """Segment documents and keep only words tagged as nouns or verbs.

    Each document is split into (content, result) word lists via
    ``content_result``; both lists are then filtered to the POS tags
    'n', 'nl', 'ns' and 'v'.

    Parameters
    ----------
    document_list : iterable
        Documents accepted by ``content_result``.

    Returns
    -------
    tuple(list, list)
        ``(content_list, result_list)`` of POS-filtered word lists,
        in input order.
    """
    content_list = []
    result_list = []
    # Keep nouns (n / nl / ns) and verbs (v) only.
    pos_filter = ['n', 'nl', 'ns', 'v']
    myseg = MySegment()
    mypos = MyPostagger()
    try:
        for document in document_list:
            content_wordlist, result_wordlist = content_result(myseg, document)
            content_wordlist = mypos.words2pos(content_wordlist, pos_filter)
            result_wordlist = mypos.words2pos(result_wordlist, pos_filter)
            content_list.append(content_wordlist)
            result_list.append(result_wordlist)
    finally:
        # Fix: close both NLP resources even when segmentation raises
        # (original only closed on the success path). Also removed the
        # unused counter and dead commented-out code.
        myseg.close()
        mypos.close()
    print("----------------------------------------------")
    return content_list, result_list
예제 #3
0
def seg_document(document_list):
    """Segment documents into parallel (content, result) word lists.

    Unlike the POS-filtering variant of the same name elsewhere in this
    file, this version returns the raw segmented word lists.

    Parameters
    ----------
    document_list : iterable
        Documents accepted by ``content_result``.

    Returns
    -------
    tuple(list, list)
        ``(content_list, result_list)`` in input order.
    """
    content_list = []
    result_list = []
    myseg = MySegment()
    try:
        for i, document in enumerate(document_list):
            # Progress indicator (document index).
            print(i)
            content_wordlist, result_wordlist = content_result(myseg, document)
            content_list.append(content_wordlist)
            result_list.append(result_wordlist)
    finally:
        # Fix: the segmenter was never closed in the original (resource
        # leak); the sibling seg_document/fetch_all_content_result both
        # close it.
        myseg.close()
    return content_list, result_list

if __name__ == "__main__":

    criminal_list = [
        '交通肇事罪',  # 危险驾驶罪(危险 驾驶罪)
        '过失致人死亡罪',  # 故意杀人罪(故意 杀人 杀人罪) 故意伤害罪(故意 伤害 伤害罪)
        '故意杀人罪',
        '故意伤害罪',
        '过失致人重伤罪',
        '抢劫罪',
        #'诈骗罪', #(诈骗 诈骗罪 诈骗案)
        '拐卖妇女儿童罪'
    ]

    myseg = MySegment()
    dataH = dataHelper()

    case_dict = dict()
    save_list = list()
    reason_list = list()

    for criminal in criminal_list:
        print("~~~~~~~~~~~~~~~~~~{}~~~~~~~~~~~~~~~~~~".format(criminal))
        # case_dict = dict()
        # save_list = list()
        # reason_list = list()

        file_dir = BasePath + criminal + "/"
        dir_list = os.listdir(file_dir)
예제 #5
0
    # print(' '.join(list(return_word_set)))
    return list(return_word_set)


def get_details_words(myseg, mypos, document):
    """Extract noun/verb tokens from a document's detail section.

    The detail text is segmented, POS-filtered to nouns and verbs,
    de-duplicated, and stripped of the sentencing-behavior keywords.

    Parameters
    ----------
    myseg : segmenter with a ``sen2word(bytes)`` method.
    mypos : POS tagger with a ``words2pos(words, tags)`` method.
    document : document accepted by ``get_details``.

    Returns
    -------
    list
        Unique filtered detail words (arbitrary order).
    """
    # Sentencing-behavior keywords to exclude from the result.
    excluded = {"自首", "坦白", "累犯", "谅解", "和解"}
    detail_text = get_details(document)
    tokens = myseg.sen2word(detail_text.encode('utf8'))
    # Keep nouns (n / nl / ns) and verbs (v), drop duplicates, then
    # remove the excluded keywords via set difference.
    tagged = mypos.words2pos(tokens, ['n', 'nl', 'ns', 'v'])
    return list(set(tagged) - excluded)


if __name__ == "__main__":
    # Entry point: print the detail section of every first-instance
    # ("yishen") judgment in the corpus file.
    file_path = BasePath + "/data/judgment_yishen.txt"
    print(file_path)
    # NOTE(review): myseg/mypos are constructed but never passed to
    # get_details below — presumably kept for side effects; confirm
    # before removing.
    myseg = MySegment()
    mypos = MyPostagger()
    separator = "___________________________________________________"
    for doc in read_document(file_path):
        print(separator)
        print(get_details(doc))
    myseg.close()
    mypos.close()
예제 #6
0
    len(document_all_id_list)))
# Corpus vectors: one averaged word2vec row per document.
x_sample = np.loadtxt(BasePath +
                      "/word2vec_model/corpus_w2v_full_finance_average.txt")
# Fix: the log message previously named "corpus_w2v_full_average.txt"
# while the file actually loaded is the "finance" variant above.
print("load the corpus vector in : {}".format(
    BasePath + "/word2vec_model/corpus_w2v_full_finance_average.txt"))
# Pre-trained random-forest classifier, loaded from disk if present.
clf_filepath = BasePath + "/data/clf_model_full_average.m"
if os.path.exists(clf_filepath):
    print("the model already exists in :{}".format(clf_filepath))
    clf = joblib.load(clf_filepath)
else:
    # NOTE(review): `clf` stays undefined on this path, so any later use
    # will raise NameError — confirm whether the script should abort here.
    print("No model loaded!")

# Segmentation module and MySQL document accessor.
myseg = MySegment()
opt_Document = DocumentsOnMysql()


def rf_similarity(path_vec):
    """Pairwise similarity matrix from leaf-path indicator vectors.

    Parameters
    ----------
    path_vec : sequence of equal-length integer/boolean numpy arrays
        ``path_vec[i] & path_vec[j]`` marks the paths two samples share.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        ``sim[i][j]`` is the shared-path count divided column-wise by the
        diagonal self-count, so ``sim[i][i] == 1.0``.
    """
    fea_size = len(path_vec)
    sim_vec = np.zeros([fea_size, fea_size])
    for i in range(fea_size):
        # Hoist the loop-invariant row; fill both symmetric entries.
        # (Removed the original per-pair debug print — O(n^2) console I/O.)
        row_i = path_vec[i]
        for j in range(i, fea_size):
            num_path = np.sum(row_i & path_vec[j])
            sim_vec[i][j] = num_path
            sim_vec[j][i] = num_path
    # Preserved original normalization: .T on the 1-D diagonal is a no-op,
    # so broadcasting divides each column j by sim_vec[j][j].
    return sim_vec / sim_vec.diagonal().T