import os
import os.path as op
import json

# read_regex, traversal, main_han, read_academician, save_dict, words_cut and
# the four *_extract crawlers are defined elsewhere in this project.


def extractdir(path, dirname1, dirname2):
    '''Collect every pre-segmented file under `path`, extract resume
    information from each one, and save the result under a mirrored
    directory tree in which `dirname1` is replaced by `dirname2`.'''
    text_list = []
    for root, dirs, files in os.walk(path):  # walks top-down by default
        if files:
            for name in files:
                text_list.append(os.path.join(root, name))
    # load the regular expressions
    filename = '/home/chenxl/data_mining_resume/task_2/regexpression.txt'
    dict_ = read_regex(filename)
    # information extraction
    for txt in text_list:
        print("extracting the file %s" % txt)
        dirpath = op.dirname(txt)  # directory of the source file
        dirpath = dirpath.replace(dirname1, dirname2)
        if not op.exists(dirpath):  # create the resume folder
            # makedirs (unlike mkdir) also creates missing parent directories
            os.makedirs(dirpath, exist_ok=True)
        txtname = op.basename(txt)  # bare file name
        save_path = op.join(dirpath, txtname)  # combine into the output path
        lines = read_academician(txt)
        dict_jiben = traversal(dict_, lines)
        # named `tenure` rather than `time` so the `time` module is not shadowed
        work, tenure = main_han('./task_2/title_list.py', txt)
        dict_jiben["人物履历"]["工作经历"]["人物经历"] = work
        dict_jiben["人物履历"]["工作经历"]["任职经历"] = tenure
        save_dict(dict_jiben, save_path)
        print("save the resume file %s" % save_path)
    print("Extracted %s files in total." % len(text_list))
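# The os.walk loop at the top of extractdir can be condensed into a single
# comprehension. A minimal equivalent sketch; the helper name `list_files`
# is ours, not from the project.
def list_files(path):
    """Return the path of every file under `path`, walked top-down."""
    return [os.path.join(root, name)
            for root, _dirs, files in os.walk(path)
            for name in files]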
def extract_from_file(raw_info_path, reg_file):
    '''Walk the academician folder, segment every file, extract resume
    information, and save each result as JSON in a mirrored directory.'''
    text_list = []
    # os.walk yields (root, dirs, files): the current directory, its
    # sub-directories, and the files it contains.
    for root, dirs, files in os.walk(raw_info_path):
        if files:  # skip directories that contain no files
            for name in files:
                text_list.append(os.path.join(root, name))  # full file path
    # segment and extract each file
    for txt in text_list:
        print(txt + " is being processed...............")
        dirpath = op.dirname(txt)  # directory of the source file
        dirpath = dirpath.replace("原始院士信息", '抽取的院士信息')
        if not op.exists(dirpath):  # create the output folder
            os.makedirs(dirpath, exist_ok=True)
        txtname = op.basename(txt)  # bare file name
        with open(txt, "r", encoding='utf-8') as f:  # read the file contents
            text = f.readlines()
        text_cut = words_cut(text, False)  # segmentation with the Boson tokenizer
        work = main_han(text_cut)  # extract work experience
        extract_info = traversal(read_regex(reg_file), text_cut)  # extract the other fields
        extract_info['人物履历']['工作经历']['博士后'] = ''
        extract_info["人物履历"]["工作经历"]["任职"] = work
        extract_info['人物履历']['工作经历']['任免_辞职'] = ''
        extract_info["院士名"] = txtname.split("_")[0]
        if "360" in txtname:
            extract_info["百科名"] = "360百科"
        if "搜狗百科" in txtname:
            extract_info["百科名"] = "搜狗百科"
        if "互动百科" in txtname:
            extract_info["百科名"] = "互动百科"
        if "百度百科" in txtname:
            extract_info["百科名"] = "百度百科"
        # mirror the folder structure and build the output path
        save_path = op.join(dirpath, txtname).replace(".txt", ".json")
        with open(save_path, "w", encoding="utf8") as f:
            json.dump(extract_info, f, indent=4, ensure_ascii=False)
        print("{} saved!".format(txtname.replace(".txt", ".json")))
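# The four `if marker in txtname` checks above share one shape, so they can
# also be written table-driven. A minimal sketch; the helper name
# `detect_baike` is ours, not from the project. Markers are tried in the
# reverse of the order above so that, as there, a later check takes
# precedence in the (unlikely) case of multiple matches.
def detect_baike(txtname):
    """Return the encyclopedia name embedded in `txtname`, or ''."""
    for marker, baike in (("百度百科", "百度百科"), ("互动百科", "互动百科"),
                          ("搜狗百科", "搜狗百科"), ("360", "360百科")):
        if marker in txtname:
            return baike
    return ""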
def main_(name):
    '''name: the academician's name. Crawl the person's page from four
    encyclopedias, segment the text, extract structured resume information
    from each source, and dump everything to "<name>info.json".'''
    reg_file = "/home/chenxl/data_mining_resume/task_2/regexpression.txt"
    regex_dict = read_regex(reg_file)  # load the regular expressions once
    # (crawler, key in the result dict, encyclopedia name)
    sources = [
        (sougou_extract, "sougou", "搜狗百科"),
        (baidu_extract, "baidu", "百度百科"),
        (hudong_extract, "hudong", "互动百科"),
        (bk360_extract, "bk360", "360百科"),
    ]
    info = {}
    for crawler, key, baike_name in sources:
        raw = crawler(name)                  # crawl the page
        cut = words_cut(raw, False)          # segment with the Boson tokenizer
        work = main_han(cut)                 # extract work experience
        result = traversal(regex_dict, cut)  # extract the remaining fields
        result['人物履历']['工作经历']['博士后'] = ''
        result["人物履历"]["工作经历"]["任职"] = work
        result['人物履历']['工作经历']['任免_辞职'] = ''
        result["院士名"] = name
        result["百科名"] = baike_name
        info[key] = result
    with open(name + "info.json", "w", encoding="utf8") as f:
        json.dump(info, f, indent=4, ensure_ascii=False)
    extract_result = [info["baidu"], info["sougou"], info["hudong"], info["bk360"]]
    return extract_result
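# A usage sketch for main_; the academician name below is a hypothetical
# placeholder, not from the project. The call crawls all four encyclopedias
# for one person, writes the merged results to "<name>info.json", and
# returns them as a list in the order [baidu, sougou, hudong, bk360].
#
# if __name__ == "__main__":
#     results = main_("张三")  # hypothetical name
#     for result in results:
#         print(result["百科名"], result["院士名"])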