# NER tasks requested from the HanLP service on every call.
_NER_TASKS = ("ner/pku", "ner/msra", "ner/ontonotes")


def _flatten_entities(sentences):
    """Flatten a list of per-sentence entity lists into one flat list."""
    return [entity for sentence in sentences for entity in sentence]


def _group_by_type(entities):
    """Group entity texts by label: ``{label: [text, ...]}``.

    Each entity is read as ``entity[0]`` (text) and ``entity[1]`` (label).
    """
    grouped = {}
    for entity in entities:
        grouped.setdefault(entity[1], []).append(entity[0])
    return grouped


def _split_time_and_place_runs(entities):
    """Split an entity list into runs of dates and runs of organizations.

    Consecutive ``DATE`` entities form one time group and consecutive
    ``ORGANIZATION`` entities form one place group; entities with other
    labels do not interrupt a run. The groups can afterwards be paired up
    one-to-one by index.

    Returns:
        ``{'time': [[text, ...], ...], 'pos': [[text, ...], ...]}``
    """
    time_groups = []
    place_groups = []
    current = []
    in_time_run = False
    in_place_run = False
    for entity in entities:
        text, label = entity[0], entity[1]
        if label == 'DATE':
            if in_time_run:
                current.append(text)
            else:
                # A date after a place run closes that place run.
                in_time_run, in_place_run = True, False
                if current:
                    place_groups.append(current)
                current = [text]
        elif label == 'ORGANIZATION':
            if in_place_run:
                current.append(text)
            else:
                # A place after a time run closes that time run.
                in_time_run, in_place_run = False, True
                if current:
                    time_groups.append(current)
                current = [text]
        elif label == 'PHONE':
            # Everything collected before a PHONE entity is discarded.
            # NOTE(review): the run flags are deliberately left untouched,
            # as in the original logic - confirm this is intended.
            time_groups, place_groups, current = [], [], []
    # Flush the run that was still open when the input ended.
    if in_time_run:
        time_groups.append(current)
    else:
        place_groups.append(current)
    return {'time': time_groups, 'pos': place_groups}


def _leading_year(time_group):
    """Return the int made of the first four characters of the first string
    in *time_group* whose first four characters are all decimal digits, or
    ``None`` when no string qualifies."""
    for text in time_group:
        prefix = text[:4]
        # isdecimal (not isdigit): only characters int() can really parse.
        if prefix.isdecimal():
            return int(prefix)
    return None


def _pair_and_sort_by_year(groups):
    """Pair time and place groups by index, sort by year, drop repeats.

    Pairs whose time group contains no parseable year are skipped (they
    cannot be ordered). After sorting by year, only the first pair for each
    distinct leading place name (whitespace-stripped) is kept.

    Returns:
        A list of ``{'time': [...], 'pos': [...], 'timeKey': int}`` dicts.
    """
    paired = []
    for times, places in zip(groups['time'], groups['pos']):
        year = _leading_year(times)
        if year is None:
            continue
        paired.append({'time': times, 'pos': places, 'timeKey': year})
    paired.sort(key=lambda entry: entry['timeKey'])

    seen_places = set()
    unique = []
    for entry in paired:
        first_place = entry['pos'][0].strip()
        if first_place in seen_places:
            continue
        seen_places.add(first_place)
        unique.append(entry)
    return unique


def _merge_places_with_years(place_names, sorted_pairs):
    """Return ``{'pos': name, 'time': year}`` for every occurrence of each
    name from *place_names* inside the place groups of *sorted_pairs*."""
    merged = []
    for place in place_names:
        for pair in sorted_pairs:
            for candidate in pair['pos']:
                if candidate == place:
                    merged.append({'pos': place, 'time': pair['timeKey']})
    return merged


def getContent(string, client=None):
    """Run HanLP NER on *string* and relate organizations to years.

    Args:
        string: Text to analyse.
        client: Optional callable invoked as ``client(string, tasks=[...])``
            that returns a mapping from task name to per-sentence entity
            lists. When omitted, a ``HanLPClient`` for the hosted API is
            created.

    Returns:
        A three-element list:

        1. MSRA time/place pairs sorted by year, one per leading place name
           (``[{'time': [...], 'pos': [...], 'timeKey': int}, ...]``).
        2. ``[{'pos': name, 'time': year}, ...]`` for every PKU ``nt`` name
           that also appears in a place group of (1), ordered by the name's
           first appearance in the PKU output.
        3. The set of all PKU ``nt`` names (empty when there are none).
    """
    if client is None:
        # NOTE(review): this credential is committed in source; consider
        # rotating it and loading it from configuration instead.
        client = HanLPClient(
            'https://hanlp.hankcs.com/api',
            auth='MTk4N0BiYnMuaGFua2NzLmNvbTowSUdEZnhYZUhhN3JvOVBR',
        )
    # NOTE(review): "ner/ontonotes" is still requested to keep the request
    # unchanged, but its output is not used below.
    raw_result = client(string, tasks=list(_NER_TASKS))

    pku_entities = _flatten_entities(raw_result["ner/pku"])
    msra_entities = _flatten_entities(raw_result["ner/msra"])

    sorted_pairs = _pair_and_sort_by_year(
        _split_time_and_place_runs(msra_entities)
    )

    # .get(): a text without any 'nt' entity yields no names, not a KeyError.
    pku_orgs = _group_by_type(pku_entities).get('nt', [])
    # De-duplicate while keeping first-seen order so the merged list is
    # deterministic (iterating a set of str is not stable across runs).
    ordered_orgs = list(dict.fromkeys(pku_orgs))

    return [
        sorted_pairs,
        _merge_places_with_years(ordered_orgs, sorted_pairs),
        set(pku_orgs),
    ]
def setUp(self) -> None:
    """Bind a ``HanLPClient`` for the hosted HanLP API to ``self.HanLP``.

    The client is created with ``auth=None``; replace it with your own
    auth value if you have one.
    """
    endpoint = 'https://hanlp.hankcs.com/api'
    self.HanLP = HanLPClient(endpoint, auth=None)  # Fill in your auth