def fetch_all_content_result(document_list):
    """Segment every document into (content, result) word lists, without POS filtering."""
    print("in fetch")
    content_all_list = []
    result_all_list = []
    i = 0
    myseg = MySegment()
    for document in document_list:
        content, result = content_result(myseg, document)
        content_all_list.append(content)
        result_all_list.append(result)
        i += 1
        print(i)  # progress counter
    myseg.close()
    return content_all_list, result_all_list
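
# `content_result` is defined elsewhere in this repo. For readers, a minimal
# hypothetical sketch of the assumed contract: split a judgment into its facts
# part and its holding/result part, then segment both. The "本院认为" ("this
# court holds") split marker is an assumption for illustration only, not the
# repo's actual logic.
def _content_result_sketch(myseg, document):
    content_text, result_text = document.split("本院认为", 1)
    return (myseg.sen2word(content_text.encode('utf8')),
            myseg.sen2word(result_text.encode('utf8')))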
def seg_document(document_list):
    """Segment every document into (content, result) word lists,
    keeping only nouns, location nouns, place names, and verbs."""
    content_list = []
    result_list = []
    myseg = MySegment()
    mypos = MyPostagger()
    for document in document_list:
        content_wordlist, result_wordlist = content_result(myseg, document)
        # Keep only the LTP POS tags: n (noun), nl (location noun),
        # ns (geographical name), v (verb).
        content_wordlist = mypos.words2pos(content_wordlist, ['n', 'nl', 'ns', 'v'])
        result_wordlist = mypos.words2pos(result_wordlist, ['n', 'nl', 'ns', 'v'])
        content_list.append(content_wordlist)
        result_list.append(result_wordlist)
    myseg.close()
    mypos.close()
    print("----------------------------------------------")
    return content_list, result_list
def seg_document_raw(document_list):
    """Like seg_document, but keeps all words (no POS filtering)."""
    content_list = []
    result_list = []
    myseg = MySegment()
    i = 0
    for document in document_list:
        print(i)  # progress counter
        content_wordlist, result_wordlist = content_result(myseg, document)
        content_list.append(content_wordlist)
        result_list.append(result_wordlist)
        i += 1
    myseg.close()
    return content_list, result_list
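
# MySegment and MyPostagger are thin wrappers defined elsewhere in this repo,
# presumably around a segmentation/POS toolkit such as LTP. A minimal sketch
# of the assumed shape, using pyltp; the class names and model paths below are
# hypothetical:
class _SegmentSketch(object):
    def __init__(self, model_path="ltp_data/cws.model"):  # hypothetical path
        from pyltp import Segmentor
        self._seg = Segmentor()
        self._seg.load(model_path)

    def sen2word(self, sentence):
        # Segment a raw sentence into a word list.
        return list(self._seg.segment(sentence))

    def close(self):
        self._seg.release()


class _PostaggerSketch(object):
    def __init__(self, model_path="ltp_data/pos.model"):  # hypothetical path
        from pyltp import Postagger
        self._pos = Postagger()
        self._pos.load(model_path)

    def words2pos(self, words, keep_tags):
        # Keep only the words whose LTP POS tag is in keep_tags.
        tags = self._pos.postag(words)
        return [w for w, t in zip(words, tags) if t in keep_tags]

    def close(self):
        self._pos.release()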
if __name__ == "__main__": criminal_list = [ '交通肇事罪', # 危险驾驶罪(危险 驾驶罪) '过失致人死亡罪', # 故意杀人罪(故意 杀人 杀人罪) 故意伤害罪(故意 伤害 伤害罪) '故意杀人罪', '故意伤害罪', '过失致人重伤罪', '抢劫罪', #'诈骗罪', #(诈骗 诈骗罪 诈骗案) '拐卖妇女儿童罪' ] myseg = MySegment() dataH = dataHelper() case_dict = dict() save_list = list() reason_list = list() for criminal in criminal_list: print("~~~~~~~~~~~~~~~~~~{}~~~~~~~~~~~~~~~~~~".format(criminal)) # case_dict = dict() # save_list = list() # reason_list = list() file_dir = BasePath + criminal + "/" dir_list = os.listdir(file_dir)
    return list(return_word_set)


def get_details_words(myseg, mypos, document):
    """Segment the details section of a judgment, keep only nouns/verbs,
    and drop generic sentencing-behavior terms."""
    # Sentencing-behavior terms to exclude: 自首 (voluntary surrender),
    # 坦白 (confession), 累犯 (recidivism), 谅解 (victim's forgiveness),
    # 和解 (reconciliation).
    behavior_set = {"自首", "坦白", "累犯", "谅解", "和解"}
    decode_document = get_details(document)
    details_words = myseg.sen2word(decode_document.encode('utf8'))
    sdetails_words = list(
        set(mypos.words2pos(details_words, ['n', 'nl', 'ns', 'v'])) - behavior_set)
    return sdetails_words


if __name__ == "__main__":
    file_path = BasePath + "/data/judgment_yishen.txt"
    print(file_path)
    myseg = MySegment()
    mypos = MyPostagger()
    document_list = read_document(file_path)
    for document in document_list:
        print("___________________________________________________")
        print(get_details(document))
    myseg.close()
    mypos.close()
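
# The __main__ block above only prints the raw details text; a call like the
# following (a hypothetical usage sketch, not part of the original script)
# would exercise the full segment-and-filter pipeline instead:
def _demo_get_details_words():
    myseg, mypos = MySegment(), MyPostagger()
    for document in read_document(BasePath + "/data/judgment_yishen.txt"):
        print(' '.join(get_details_words(myseg, mypos, document)))
    myseg.close()
    mypos.close()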
        len(document_all_id_list)))

    # Corpus vectors
    x_sample = np.loadtxt(BasePath + "/word2vec_model/corpus_w2v_full_finance_average.txt")
    print("loaded the corpus vectors from: {}".format(
        BasePath + "/word2vec_model/corpus_w2v_full_finance_average.txt"))

    # Random-forest model: load it if already trained
    clf_filepath = BasePath + "/data/clf_model_full_average.m"
    if os.path.exists(clf_filepath):
        print("the model already exists in: {}".format(clf_filepath))
        clf = joblib.load(clf_filepath)
    else:
        print("No model loaded!")

    # Segmentation module and database connection
    myseg = MySegment()
    opt_Document = DocumentsOnMysql()


def rf_similarity(path_vec):
    """Pairwise similarity of leaf-path indicator vectors.
    Entry [i][j] is |paths(i) & paths(j)| divided by the j-th self-count:
    the 1-D diagonal broadcasts across columns (the original .T was a
    no-op on a 1-D array and has been dropped)."""
    fea_size = len(path_vec)
    sim_vec = np.zeros([fea_size, fea_size])
    for i in range(0, fea_size):
        for j in range(i, fea_size):
            num_path = np.sum(path_vec[i] & path_vec[j])
            sim_vec[i][j] = num_path
            sim_vec[j][i] = num_path
    return sim_vec / sim_vec.diagonal()
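
# A quick self-contained check of rf_similarity on a tiny indicator matrix
# (3 samples x 4 leaf-path features); the data is made up. With the
# broadcasting above, entry [i][j] is |paths(i) & paths(j)| / |paths(j)|,
# so the normalized matrix is not symmetric in general.
def _demo_rf_similarity():
    demo = np.array([[1, 0, 1, 1],
                     [1, 1, 0, 1],
                     [0, 1, 1, 0]])
    print(rf_similarity(demo))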