def zhuanli_duplicate():
    """Delete duplicate patent rows from `pss_zhuanli_copy`.

    The rows returned by the GROUP BY (TIVIEW, INVIEW) query are kept; the id
    of every other row in the table is handed to a batched DELETE.

    NOTE(review): this file defines `zhuanli_duplicate` more than once, so a
    later definition replaces this one when the module is loaded.

    :return: None
    """
    kept_rows = dbs.getDics("SELECT * FROM `pss_zhuanli_copy` GROUP BY TIVIEW, INVIEW")
    print(len(kept_rows))
    kept_ids = {str(row['id']) for row in kept_rows}

    all_rows = dbs.getDics("SELECT * FROM `pss_zhuanli_copy`")
    id_list = [row['id'] for row in all_rows if str(row['id']) not in kept_ids]
    print(len(id_list))

    d_sql = ''' DELETE FROM `pss_zhuanli_copy` WHERE id =%s '''
    print(dbs.exe_many(d_sql, id_list))
def institution_email():
    """Append newly detected institutional e-mail addresses to the dictionary.

    Page texts are read from `teacherdata_info` (id >= 40146). An address that
    appears on more than two pages is treated as an institutional (shared)
    address and is appended to the `institution_email.txt` dictionary under
    DIR, unless the dictionary already lists it.

    :return: None
    """
    from collections import Counter

    rows = dbs.getDics("select id, info from teacherdata_info where id >= 40146")

    # Number of pages on which each address occurs.
    page_hits = Counter()
    for row in rows:
        if not row["info"]:
            continue
        text = row["info"].replace("[at]", "@").replace(" ", "").replace("\n", "")
        found = [match[0] for match in re.findall(reEmail, text)]
        if found:
            # An address repeated within one page counts once for that page.
            page_hits.update(set(found))
            print('#' * 20)
        else:
            print('*' * 20)

    with open(DIR + "\\dicts\\institution_email.txt", "a+", encoding="utf-8") as file:
        # "a+" opens positioned at end-of-file; without rewinding, read()
        # returns "" and already-known addresses would be appended again.
        file.seek(0)
        existing = file.read()
        known = set(existing.split('\n'))

        list_1 = [
            email for email in sorted(page_hits)
            if page_hits[email] > 2 and email not in known
        ]
        print(list_1)
        print(len(list_1))

        if list_1:
            # Keep the first new entry from fusing with the last existing line.
            if existing and not existing.endswith('\n'):
                file.write('\n')
            file.write("\n".join(list_1))
def get_paper_data():
    """Copy papers of '08…' disciplines from `paper_clean1` into `paper_data`.

    The source table is scanned in id windows of 10000 up to id 6000000. Rows
    missing a name or an abstract are skipped; the rest are inserted in
    batches of at most 5000. The number of rows handed to the INSERT is
    printed at the end.

    :return: None
    """
    end = 6000000
    step = 10000
    s_sql = (
        " SELECT t1.id, t1.`name`, t1.abstract, t2.discipline_code "
        "FROM paper_clean1 t1, teacher_dis_code t2 "
        "WHERE t2.id = t1.author_id AND t2.discipline_code like '08%%' "
        "and t1.id > %s and t1.id <= %s; "
    )
    u_sql = "INSERT paper_data(id, title, abstract, discipline) VALUES(%s, %s, %s, %s);"

    total = 0
    for lower in range(0, end - step + 1, step):
        query = s_sql % (str(lower), str(lower + step))
        print(query)
        batch = []
        for row in dbs.getDics(query):
            if not row["name"] or not row["abstract"]:
                continue
            batch.append((row["id"], row["name"], row["abstract"], row["discipline_code"]))
            if len(batch) == 5000:
                print(dbs.exe_many(u_sql, batch))
                batch = []
                total += 5000
        # Flush whatever is left of this id window.
        total += len(batch)
        print(dbs.exe_many(u_sql, batch))
    print(total)
def get_edu_exp():
    """Extract education items for `teacher_eduexp` rows with type = 0.

    Each non-empty `info_clear` text is fed to a `TextAttribute` instance
    (split on newlines, gravity computed) and `get_edu_items()` is queried.
    When items are returned, they are joined with newlines and queued,
    together with the returned type value, for an UPDATE of `edu_exp`/`type`.

    :return: None
    """
    rows = dbs.getDics("select id, info_clear from teacher_eduexp where type = 0")
    print(len(rows))

    analyser = TextAttribute()
    found = 0
    update_list = []
    for row in rows:
        text = row["info_clear"]
        if text is None or text == "":
            continue
        analyser.set_text(text)
        analyser.seg_sentence("\n")
        analyser.compute_gravity()
        kind, edu_items = analyser.get_edu_items()
        if not edu_items:
            continue
        print(row["id"])
        print(kind, edu_items)
        found += 1
        update_list.append(("\n".join(edu_items), kind, row["id"]))

    print(found)
    print(len(update_list))
    update_sql = "update teacher_eduexp set edu_exp=%s, type=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
def data_clean():
    """Turn `javascript:window.open(...)` links into ordinary URLs.

    Works on `eds_985teacher` rows of school '中南大学' whose `link` contains
    "javascript". The path and the name argument are pulled out of the
    `open('<path>'+encodeURI('<name>'))` call, the name is percent-encoded,
    and the result is joined onto the row's `institution_url` (via `pa`,
    presumably an alias of `urllib.parse` - confirm) and stored in `all_link`.

    Rows whose link does not have the expected shape are reported and skipped
    instead of aborting the whole run with an IndexError.

    :return: None
    """
    rows = dbs.getDics(
        "SELECT * FROM `eds_985teacher` WHERE link like '%javascript%' AND school = '中南大学';"
    )
    print(len(rows))

    # Example: javascript:window.open('/blog/content2?name='+encodeURI('周雄伟'))
    link_pattern = re.compile(r"open\('(.+?)'\+encodeURI\('(.+?)'\)\)")

    u_list = []
    for row in rows:
        raw_link = row['link']
        if not raw_link:
            continue
        match = link_pattern.search(raw_link)
        if match is None:
            print("skipped id %s: unrecognised link %r" % (row['id'], raw_link))
            continue
        path, name = match.groups()
        link = pa.urljoin(row['institution_url'], path + pa.quote(name))
        print(link)
        u_list.append((link, row['id']))

    print(len(u_list))
    u_sql = "UPDATE eds_985teacher SET all_link=%s WHERE id = %s"
    print(dbs.exe_many(u_sql, u_list))
def x_t():
    """Tally tokens that jieba tags "x" in the discipline-'0812' papers.

    Title and abstract of each matching `paper_data` row are segmented with
    `jieba.posseg`; every token whose flag is "x" is counted under the key
    "<word>--SPLIT--<flag>".

    NOTE(review): the function looks unfinished. It opens `stopword_base.txt`
    in the working directory for writing (which truncates it) but never
    writes the tallies. That side effect is preserved; the handle is now
    closed instead of being leaked.

    :return: None
    """
    import jieba.posseg as pseg

    s_sql = "SELECT id, title, abstract FROM paper_data WHERE discipline='0812'"
    data_list = dbs.getDics(s_sql)

    x_word_dict = dict()
    for data in data_list:
        text = data['title'] + "\n" + data['abstract']
        for w, f in pseg.cut(text, HMM=True):
            if f == "x":
                key = w + "--SPLIT--" + f
                x_word_dict[key] = x_word_dict.get(key, 0) + 1

    with open('.\\stopword_base.txt', 'w', encoding='utf8'):
        # TODO(review): decide what should be written here. The original
        # split each key of x_word_dict back into word / flag / count at this
        # point and then discarded the values.
        pass
def f():
    """Print, for every row of `jishulingyuyuxueke.csv`, schools per discipline.

    `dis_name.txt` supplies the names looked up in `discipline_school`. The
    second column of each CSV row is split on '-' (presumably a list of
    discipline names - confirm), and each school is printed once followed by
    the disciplines of that row it offers, formatted as `school(d1/d2)`.

    NOTE(review): this file defines `f` more than once, so a later definition
    replaces this one when the module is loaded.

    :return: None
    """
    with open('dis_name.txt', 'r', encoding='utf8') as name_file:
        dis_name = set(name_file.read().split('\n'))

    # NOTE(review): the name is interpolated straight into the SQL text; a
    # name containing a quote breaks the statement. Switch to a parameterised
    # call if the `dbs` helper supports one.
    s_sql = "SELECT school FROM `discipline_school` WHERE `name` = '%s' AND school_id IS NOT NULL"
    dis_school_dict = dict()
    for name in dis_name:
        print(name)
        dis_school_dict[name] = [row['school'] for row in dbs.getDics(s_sql % name)]

    with open('jishulingyuyuxueke.csv', 'r') as csv_file:
        lingyu_xueke = [tuple(node) for node in csv.reader(csv_file)]

    re_li = list()
    for node in lingyu_xueke:
        # school -> '-'-joined disciplines of this row offered by that school
        di = dict()
        for d in node[1].split('-'):
            for s in dis_school_dict[d]:
                if di.get(s):
                    di[s] += "-" + d
                else:
                    di[s] = d
        ll = [key + '(' + '/'.join(value.split('-')) + ')' for key, value in di.items()]
        re_li.append(','.join(ll))

    print(re_li)
    print(len(re_li))
    print('\n'.join(re_li))
def test():
    """Write addresses seen on more than three pages, with counts, to `1.csv`.

    Page texts are read from `teacherdata_info` (id >= 40146). Each address
    is counted once per page; addresses found on more than three pages are
    written as `address,count` lines in sorted order.

    :return: None
    """
    from collections import Counter

    rows = dbs.getDics("select id, info from teacherdata_info where id >= 40146")

    # Number of pages on which each address occurs; a single counting pass
    # replaces the quadratic list.count() per address.
    page_hits = Counter()
    for row in rows:
        if not row["info"]:
            continue
        text = row["info"].replace("[at]", "@").replace(" ", "").replace("\n", "")
        found = [match[0] for match in re.findall(reEmail, text)]
        if found:
            page_hits.update(set(found))
            print('#' * 20)
        else:
            print('*' * 20)

    list_1 = [
        email + "," + str(page_hits[email])
        for email in sorted(page_hits)
        if page_hits[email] > 3
    ]
    print(list_1)
    print(len(list_1))

    with open("1.csv", "w", encoding="utf8") as file:
        file.write("\n".join(list_1))
def show_data():
    """Report entries whose degree is "毕业" but whose organisation is empty.

    Reads `teacher_eduexp` rows with ok = 1, prints every such entry of the
    `ne` list, and finally prints how many teachers had at least one.

    :return: None
    """
    rows = dbs.getDics("select id, ne, exp_clear from teacher_eduexp where ok = 1")
    print(len(rows))

    teachers_hit = 0
    for row in rows:
        # Result unused; kept so a NULL exp_clear fails exactly as before.
        row["exp_clear"].split('\n')
        # NOTE(review): eval() executes whatever text is stored in `ne`.
        ne_list = eval(row["ne"])
        if not ne_list:
            continue
        hit = False
        for ne in ne_list:
            if ne and ne.get("degree", "") == "毕业" and ne.get("org", "") == "":
                hit = True
                print(ne)
        if hit:
            teachers_hit += 1
    print(teachers_hit)
def get_email():
    """Fill the empty `email` column of `teacherdata_info` (id >= 40146).

    For homepages on cksp.eol.cn the `info` column is evaluated as a Python
    literal and its "E-mail" entry is searched; for all other pages the raw
    text is searched after "[at]" is turned into "@" and spaces/newlines are
    removed. Addresses listed in the `institution_email.txt` dictionary under
    DIR are dropped, and the remaining ones are stored joined with ';'.

    :return: None
    """
    info_sql = "select id, info, homepage from teacherdata_info where id >= 40146 and email=''"
    rows = dbs.getDics(info_sql)

    with open(DIR + "\\dicts\\institution_email.txt", "r", encoding="utf-8") as dict_file:
        institution_emails = {line.strip('\n') for line in dict_file}

    update_list = []
    for row in rows:
        if not row["info"]:
            continue
        if re.search(r'cksp\.eol\.cn', row["homepage"]) is not None:
            # NOTE(review): eval() executes whatever text is stored in `info`;
            # consider ast.literal_eval if the column only holds literals.
            info_dict = eval(row["info"])
            try:
                email_text = [m[0] for m in re.findall(reEmail, info_dict["E-mail"])]
            except Exception:
                # Best effort, as before: skip rows with no usable "E-mail"
                # entry, but no longer swallow KeyboardInterrupt/SystemExit.
                continue
        else:
            text = row["info"].replace("[at]", "@").replace(" ", "").replace("\n", "")
            email_text = [m[0] for m in re.findall(reEmail, text)]

        if not email_text:
            continue
        # dict.fromkeys drops repeats while keeping first-seen order.
        list_email = [
            email for email in dict.fromkeys(email_text)
            if email not in institution_emails
        ]
        if list_email:
            joined = ";".join(list_email)
            print(joined)
            update_list.append((joined, row["id"]))

    print(len(update_list))
    update_sql = "update teacherdata_info set email=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
def clear_1():
    """Filter the `edu_exp` lines of `teacher_eduexp` rows with type = 2.

    Lines that mention a birth year, a job, or (without any degree wording) a
    job title or a publication are discarded; of the rest, only lines that
    contain education wording are kept. Teachers whose text changed are
    printed for inspection.

    NOTE(review): the UPDATE statement is only built here; nothing in this
    function executes it.

    :return: None
    """
    teacher_list = dbs.getDics("select id, edu_exp from teacher_eduexp where type = 2")
    print(len(teacher_list))

    re_title = r'讲师|教授|指导|博士生导师|研究生导师|硕士生导师|从事|研究员|所长|院长|博导|硕导'
    re_job = r'任教|任|从事'
    re_publish = r'《|》|出版|学报|杂志'
    re_birth = r'[1-2][9,0][0-9]{2}生|出生|生于'
    re_degree = r'学位|学士|硕士|博士|进修|硕博'
    re_education = r'博士|硕士|学士|本科|研究生|访问学者|博士后|获|毕业|进修|学习|学位|直博|访问|MSW|硕博'
    # Words that sometimes arrive with a space in the middle.
    spaced_words = (
        ('教 授', "教授"),
        ('讲 师', "讲师"),
        ('学 士', "学士"),
        ('博 士', "博士"),
        ('硕 士', "硕士"),
    )

    update_list = []
    num = 0
    for teacher in teacher_list:
        lines = teacher["edu_exp"].split('\n')
        new_lines = []
        for line in lines:
            for spaced, joined in spaced_words:
                line = line.replace(spaced, joined)
            if re.search(re_birth, line) or re.search(re_job, line):
                continue
            if re.search(re_title, line):
                if not re.search(re_degree, line):
                    continue
                # The only "博士"/"硕士" is the one inside "…生导师".
                doctor_is_title = line.count('博士') == 1 and line.count('博士生导师') == 1
                master_is_title = line.count('硕士') == 1 and line.count('硕士生导师') == 1
                if doctor_is_title or master_is_title:
                    continue
            has_education = re.search(re_education, line)
            if re.search(re_publish, line) and not has_education:
                continue
            if has_education:
                new_lines.append(line)

        before = '\n'.join(lines)
        after = '\n'.join(new_lines)
        if before != after:
            print(teacher["id"])
            print(before)
            print('-' * 10)
            print(after)
            print('-' * 10)
            num += 1
            if num % 1000 == 0:
                print()
        update_list.append((after, 2, teacher["id"]))

    print(num)
    print(len(update_list))
    update_sql = "update teacher_eduexp set exp_clear = %s, clear=%s where id = %s"
def date2date():
    """Normalise the `date` values stored in `teacher_eduexp.ne` (ok = 1).

    Range separators are unified to '-', and the 年/月 notation is turned
    into dotted form, e.g. "2017年3月-2017年7月" becomes "2017.3-2017.7".
    Rows whose `ne` list was touched are written back.

    A row whose `ne` cannot be evaluated is reported and skipped. Previously
    execution fell through and reused the list of the preceding row, which
    could then be stored under the wrong id.

    :return: None
    """
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    # NOTE(review): the original alternation listed "~" twice; the second one
    # was presumably the full-width tilde (U+FF5E) - confirm.
    separator = re.compile(r'-|~|\uff5e|——|至')

    update_list = []
    num = 0
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever text is stored in `ne`.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        changed = False
        for ne in ne_list:
            if not ne:
                continue
            date = ne.get("date", "")
            if separator.search(date):
                date = separator.sub('-', date)
                changed = True
            if '年' in date:
                date = date.replace('年', '.').replace('月', '')
                date = date.replace('.;', ';').strip('.')
                changed = True
            if '.-' in date:
                date = date.replace('.-', '-')
                changed = True
            ne["date"] = date

        if changed:
            num += 1
            print(ne_list)
            update_list.append((str(ne_list), teacher["id"]))
            print("-" * 10)

    print(num)
    print(len(update_list))
    u_sql = "update teacher_eduexp set ne = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
def get_abroad():
    """Flag teachers whose education includes an organisation abroad.

    An organisation counts as abroad when it is listed in `dicts/in.txt`, or
    when it is missing from the school dictionary, contains one of the
    foreign place hints and contains none of the domestic place hints. The
    flag ("1" or "0") is queued for `teacher_edu_description.abroad` for
    every teacher with a non-empty `ne` list.

    Empty organisation strings are skipped so that a blank line in `in.txt`
    cannot mark a teacher as abroad, and empty `ne` entries are skipped
    instead of raising.

    :return: None
    """
    with open(".\\dicts\\school2en_dict.txt", "r", encoding='utf8') as dict_file:
        # NOTE(review): eval() executes the file content; consider
        # ast.literal_eval if the file only holds a dict literal.
        school_dict = eval(dict_file.read())
    with open(".\\dicts\\in.txt", "r", encoding='utf8') as abroad_file:
        abroad = set(abroad_file.read().split('\n'))

    # Compiled once instead of on every organisation.
    foreign_hint = re.compile(
        '国|日本|澳大利亚|州|芬兰|瑞典|挪威|冰岛|丹麦|爱沙尼亚'
        '|拉脱维亚|立陶宛|白俄罗斯|俄罗斯|乌克兰|摩尔多瓦|波兰|捷克'
        '|斯洛伐克|匈牙利|德国|奥地利|瑞士|列支敦士登|英国|爱尔兰|荷兰'
        '|比利时|卢森堡|法国|摩纳哥|罗马尼亚|保加利亚|塞尔维亚|马其顿'
        '|阿尔巴尼亚|希腊|斯洛文尼亚|克罗地亚|波斯尼亚和墨塞哥维那'
        '|意大利|梵蒂冈|圣马力诺|马耳他|西班牙|葡萄牙|安道尔')
    domestic_hint = re.compile(
        '中国|首都|华东|华北|华南|华西|华中|西北|西南|东北|东南|北京|天津|上海|重庆|河北'
        '|山西|辽宁|吉林|黑龙江|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|广东|海南|四川'
        '|贵州|云南|陕西|甘肃|青海|台湾|内蒙|广西|西藏|宁夏|新疆|香港|澳门|石家庄|沈阳'
        '|哈尔滨|杭州|福州|济南|广州|武汉|成都|昆明|兰州|台北|南宁|银川|太原|长春|南京|合肥'
        '|南昌|郑州|长沙|海口|贵阳|西安|西宁|呼和浩特|拉萨|乌鲁木齐')

    def is_abroad(org):
        # Reduce "X大学…系" / "X大学…学院" to "X大学" before the lookups.
        org = re.sub('大学.+?系', '大学', org)
        org = re.sub('大学.+?学院', '大学', org)
        unknown_school = school_dict.get(org, "") == ""
        looks_foreign = bool(foreign_hint.search(org)) and not domestic_hint.search(org)
        return (unknown_school and looks_foreign) or org in abroad

    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    update_list = []
    num = 0
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever text is stored in `ne`.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        flag = 0
        for ne in ne_list:
            if not ne:
                continue
            if any(is_abroad(org) for org in ne.get("org", "").split(';') if org):
                flag = 1
                break
        if flag == 1:
            num += 1
        update_list.append((str(flag), teacher["id"]))

    print(num)
    print(len(update_list))
    u_sql = "update teacher_edu_description set abroad = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
def paper_seg():
    """Print term frequencies for a fixed sample of `paper_data` rows.

    Title and abstract are cut with jieba (full mode, HMM off) after loading
    the user dictionary. Tokens listed in `dicts/term.txt` are counted per
    paper and across the sample; both tallies are printed.

    NOTE(review): writing the per-paper tallies back to
    `paper_data.word_seg` (in batches of 10000) was commented out in the
    original, so this function only prints.

    :return: None
    """
    import jieba

    jieba.load_userdict('.\\dicts\\user_dict_1.txt')
    with open('.\\dicts\\term.txt', 'r', encoding='utf8') as term_file:
        terms = set(term_file.read().split('\n'))

    # Fixed sample of ids; the full run used WHERE discipline='0812'.
    s_sql = (
        "SELECT id, title, abstract FROM paper_data WHERE id in ("
        "632158,632690,633512,634259,634504,644862,645697,647947,"
        "651835,696197,697667,698942,701609,701882,702953,703279,"
        "719978,778166,781868,782636,785997,787785,788662,788852,"
        "789241,868144,869130,869391,869971,870955,871869,873702,"
        "877593,878509,878761,1057022,1069453,1070486,1085083,1085705,"
        "1086615,1088270,1096989,1328935,1329754,1330950,1333472,1336010,"
        "1376006,1379811,1382522,1384484,1519331,1538164,1591831,1912371,"
        "1913089,1913270,1915550,1916681,1921026,1921611,1922188,1923005,"
        "1923339,1923498,1924501,1925167,1934011,1935109,1942329,1947322,"
        "1951950,1955110,1978880,1983142,1986760,1987129,1989538,1990521,"
        "1991737,1995202,2023057,2030104,2032255,2032605,2039093,2043059,"
        "2045712,2051244,2064811,2090132,2090809,2091235,2102585,2103888"
        ")"
    )
    data_list = dbs.getDics(s_sql)

    fields_dict = dict()
    for data in data_list:
        abstract = data['abstract']
        seg_dict = dict()
        for word in jieba.cut(data['title'] + "\n" + abstract, HMM=False, cut_all=True):
            if word in terms:
                seg_dict[word] = seg_dict.get(word, 0) + 1
                fields_dict[word] = fields_dict.get(word, 0) + 1
        print(abstract)
        print((str(seg_dict), data['id']))
        print("*" * 10)

    for w, k in fields_dict.items():
        print("%s,%s\n" % (w, str(k)))
def get_name():
    """Check `ex_name` against the stored names of 1000 teachers.

    For each of the first 1000 `teacherdata_info` rows, the name extracted
    from `info` is compared with the `name` column; True/False is printed
    per row, followed by the row count and the number of matches.

    :return: None
    """
    teacher_data = dbs.getDics("SELECT id, name, info FROM `teacherdata_info` LIMIT 1000;")
    matched = 0
    for teacher in teacher_data:
        is_match = ex_name(teacher['info']) == teacher['name']
        if is_match:
            matched += 1
        print(is_match)
    print(len(teacher_data))
    print(matched)
def clear_3():
    """Recover dates that entity recognition filed under `org`.

    For every entry of a `teacher_eduexp.ne` list whose `date` is empty and
    whose `org` contains something that looks like a year, each ';'-separated
    part of `org` is split into its first run of Chinese characters (taken as
    the organisation, minus a leading "年") and the remainder (taken as the
    date). Changed lists are printed and queued.

    NOTE(review): the UPDATE statement is only built here; nothing in this
    function executes it.

    :return: None
    """
    teacher_list = dbs.getDics("select id, ne, exp_clear from teacher_eduexp where ne != ''")
    print(len(teacher_list))

    update_list = []
    for teacher in teacher_list:
        # NOTE(review): eval() executes whatever text is stored in `ne`.
        ne_list = eval(teacher["ne"])
        touched = False
        for ne in ne_list:
            date = ne.get("date", "")
            org = ne.get("org", "")
            if date != "" or org == "" or not re.search(r'[1-2][9,0][0-9]{2}', org):
                continue

            dates = []
            orgs = []
            for part in org.split(';'):
                cjk_runs = re.findall(r'[\u4E00-\u9FA5]+', part)
                if not cjk_runs:
                    orgs.append(part)
                    continue
                name = cjk_runs[0]
                if name.startswith('年'):
                    name = name[1:]
                # Whatever is left after removing the name is kept as the date.
                dates.append(re.sub(name, "", part))
                orgs.append(name)

            if dates:
                ne["date"] = ";".join(dates)
                ne["org"] = ";".join(orgs)
                touched = True

        if touched:
            print("-" * 10)
            print(teacher["id"])
            print(eval(teacher["ne"]))
            print("-" * 3)
            print(ne_list)
            print("-" * 10)
            update_list.append((str(ne_list), teacher["id"]))

    u_sql = "update teacher_eduexp set ne = %s where id = %s"
    print(len(update_list))
def get_tf_df():
    """Write term/document frequencies per discipline code 0801..0832.

    tf: how often a term occurs in the abstracts of a discipline.
    df: in how many abstracts of that discipline the term occurs.

    Abstracts are segmented with `jieba.posseg`; only tokens flagged an, j,
    n, nz or vn that are not in the stop-word list are counted. One
    `<code>.csv` file (term,tf,df) is written per code, and the number of
    papers per code is printed at the end.

    :return: None
    """
    import jieba.posseg as pseg

    with open('.\\dicts\\stopword_base.txt', 'r', encoding='utf8') as stop_file:
        stop_words = set(stop_file.read().split('\n'))

    code_list = ["08%02d" % i for i in range(1, 33)]
    kept_flags = {"an", "j", "n", "nz", "vn"}

    code_num = dict()
    for code in code_list:
        # The code is quoted so it is compared as a string, like the other
        # queries on paper_data.discipline in this file.
        s_sql = "SELECT * FROM paper_data WHERE discipline='%s'" % code
        data_list = dbs.getDics(s_sql)

        tf_dict = dict()
        df_dict = dict()
        for data in data_list:
            seen = set()
            for w in pseg.cut(data["abstract"], HMM=True):
                if w.word not in stop_words and w.flag in kept_flags:
                    tf_dict[w.word] = tf_dict.get(w.word, 0) + 1
                    seen.add(w.word)
            # Each abstract contributes at most 1 to a term's df.
            for word in seen:
                df_dict[word] = df_dict.get(word, 0) + 1

        with open(".\\test\\tf_df\\%s.csv" % code, "w", encoding="utf8") as fw:
            fw.write("term,tf,df\n")
            for term, tf in tf_dict.items():
                fw.write("%s,%s,%s\n" % (term, str(tf), str(df_dict.get(term, 0))))

        code_num[code] = len(data_list)

    print(code_num)
def get_institution():
    """Compute the institution score (`dis_rank`) and copy it to `teacher_rank`.

    `dis_rank` is reset to 0 for every `teacher_dis_code` row. For each
    distinct (school, discipline_code, code) match in `discipline_school`,
    5 is added when `code` has 4 characters and 1 when it has 6 characters.
    The result is then copied into `teacher_rank.institution`.

    NOTE(review): the earlier docstring listed the weights as 2 (first-level
    key discipline), 1 (second-level) and 0 (none), but the code adds 5 and
    1. Confirm which is intended.

    :return: None
    """
    sql_initial = ''' UPDATE teacher_dis_code SET dis_rank = 0 '''
    print(dbs.exe_sql(sql_initial))

    s_sql = (
        " SELECT teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code` "
        "FROM `teacher_dis_code`,`discipline_school` "
        "WHERE teacher_dis_code.discipline_code != '' "
        "AND teacher_dis_code.discipline_code IS NOT NULL "
        "AND teacher_dis_code.discipline_code = discipline_school.root "
        "AND teacher_dis_code.school = discipline_school.school "
        "GROUP BY teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`; "
    )
    # Length of discipline_school.code -> amount added to dis_rank.
    weight_by_code_length = {4: 5, 6: 1}

    u_list = []
    for data in dbs.getDics(s_sql):
        weight = weight_by_code_length.get(len(data['code']))
        if weight is not None:
            u_list.append((weight, data['school'], data['discipline_code']))
    print(len(u_list))

    u_sql = ''' UPDATE teacher_dis_code SET dis_rank = dis_rank + %s WHERE school=%s AND discipline_code=%s '''
    print(dbs.exe_many(u_sql, u_list))

    sql_initial_rank = (
        " UPDATE teacher_rank, teacher_dis_code "
        "SET teacher_rank.institution = teacher_dis_code.dis_rank "
        "WHERE teacher_rank.teacher_id = teacher_dis_code.id; "
    )
    print(dbs.exe_sql(sql_initial_rank))
def get_title():
    """Print the academic title found for each 清华大学 teacher.

    The teacher's `info` is evaluated as a Python literal where possible. For
    cksp.eol.cn pages the "个人简介" entry is used, otherwise all values are
    concatenated. If that fails, the raw `info` text is used. The text is
    handed to an `Extractor`, reduced with `sub()` and `cut_blocks()`, and
    the first title of `title_dict` found in `extractor.text` is printed with
    the teacher id.

    The text is now always passed to the extractor before matching. In the
    original, the cksp fallback assigned the text but never set it, so the
    previous teacher's text was matched instead.

    NOTE(review): `sub()`/`cut_blocks()` are applied to every teacher here;
    confirm that this also suits the cksp.eol.cn pages.

    :return: None
    """
    # Order matters: the first entry contained in the text wins.
    title_dict = ["副教授", "助理教授", "教授", "讲师", "助教", "副研究员", "助理研究员", "研究员", "高级工程师", "高级实验师", "高工", "工程师",
                  "实验师"]
    extractor = Extractor()
    select_sql = "SELECT id, name, info, all_link FROM `eds_985teacher` WHERE school = '清华大学';"
    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))

    size = [50, 200, 200]
    for teacher in teacher_list:
        raw_info = teacher['info']
        is_cksp = re.search(r'cksp\.eol\.cn', teacher["all_link"]) is not None
        try:
            # NOTE(review): eval() executes whatever text is stored in `info`.
            info = eval(raw_info)
            if is_cksp:
                person_info = info["个人简介"]
            else:
                person_info = "".join(list(info.values()))
        except Exception:
            person_info = raw_info
        if person_info is None:
            continue

        extractor.set_text(person_info)
        re_list = [r'职称|职务', r'个人简介|个人简历', teacher["name"]]  # block patterns
        extractor.sub()
        extractor.cut_blocks(re_list, size)

        title = ""
        for candidate in title_dict:
            if candidate in extractor.text:
                # "副教授" contains "教授"; a second "教授" in the text is
                # taken to mean the person is a full professor.
                if candidate == "副教授" and len(re.findall(r'教授', extractor.text)) > 1:
                    title = "教授"
                else:
                    title = candidate
                break
        if title != "":
            print((teacher["id"], title))
def t():
    """Export candidate stop words of discipline '0812' to an .xls sheet.

    Title and abstract of each matching `paper_data` row are segmented with
    `jieba.posseg`. Tokens whose flag is in `flag_list` are tallied and
    written as (word, flag, count) rows to `.\\test\\con_stop.xls`; tokens
    flagged "x" are tallied separately but not written. The number of rows
    written is printed.

    :return: None
    """
    import jieba.posseg as pseg
    import xlwt

    data_list = dbs.getDics("SELECT id, title, abstract FROM paper_data WHERE discipline='0812'")
    flag_list = [
        'vd', 'g', 'h', 'f', 's', 'r', 'nt', 'm', 'ad', 'ns', 'zg', 'z', 'ag', 'q', 'yg', 'd', 'u', 'ul', 'j', 'a',
        'y', 'ug', 'vg', 't', 'p', 'mq', 'uj', 'o', 'uv', 'k', 'c', 'nz', 'nrfg', 'tg', 'i'
    ]

    word_dict = dict()
    x_word_dict = dict()
    for data in data_list:
        text = data['title'] + "\n" + data['abstract']
        for w, f in pseg.cut(text, HMM=True):
            if f in flag_list:
                tally = word_dict
            elif f == "x":
                tally = x_word_dict
            else:
                continue
            key = w + "--SPLIT--" + f
            tally[key] = tally.get(key, 0) + 1

    wbk = xlwt.Workbook(encoding='utf-8')
    sheet = wbk.add_sheet('sheet1')
    row = 0
    for k, v in word_dict.items():
        parts = k.split('--SPLIT--')
        for col, value in enumerate((parts[0], parts[1], v)):
            sheet.write(row, col, value)
        row += 1
    wbk.save('.\\test\\con_stop.xls')
    print(row)
def zhuanli_duplicate():
    """Delete duplicate patents, keeping one id per (TIVIEW, INVIEW, APD).

    The representative rows are read from `pss_zhuanli`; the DELETE removes
    rows of `pss_zhuanli_copy` that share the three fields with a
    representative but have a different id.

    NOTE(review): the SELECT and the DELETE target different tables - confirm
    this is intended. This file also defines `zhuanli_duplicate` more than
    once; the last definition is the one in effect.

    :return: None
    """
    info_list = dbs.getDics("SELECT * FROM `pss_zhuanli` GROUP BY TIVIEW, INVIEW, APD")
    update_list = [
        (info['TIVIEW'], info['INVIEW'], info['APD'], info['id'])
        for info in info_list
    ]
    for entry in update_list:
        print(entry)
    print(len(update_list))

    d_sql = ''' DELETE FROM `pss_zhuanli_copy` WHERE TIVIEW=%s AND INVIEW=%s AND APD=%s AND id !=%s '''
    print(dbs.exe_many(d_sql, update_list))
def ne2sentence():
    """Render the `ne` entries of `teacher_eduexp` (ok = 1) as sentences.

    For each entry, the names of its non-empty fields (in `ne_name` order)
    are joined with ',' to form a template key; unseen keys are appended to
    the imported `sentence_template` list. Entries with no degree, or with a
    degree of 学士/硕士/博士, are passed to `nn(key, entry)` and non-empty
    results are collected. The joined sentences are written to
    `teacher.eduexp`.

    :return: None
    """
    from algorithm.li.extract.templates.ne2sentence_template import sentence_template

    ne_name = ["org", "date", "degree", "country", "state_or_province", "major", "discipline_category", "graduate"]
    plain_degrees = ["学士", "硕士", "博士"]

    teacher_list = dbs.getDics("select id, ne, exp_clear from teacher_eduexp where ok = 1")
    print(len(teacher_list))

    update_list = []
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever text is stored in `ne`.
            ne_list = eval(teacher["ne"])
        except:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        sentences = []
        for ne in ne_list:
            key = ",".join(field for field in ne_name if ne.get(field, "") != "")
            if key != "" and key not in sentence_template:
                sentence_template.append(key)
            degree = ne.get("degree", "")
            if degree == "" or degree in plain_degrees:
                sentence = nn(key, ne)
                if sentence != "":
                    sentences.append(sentence)
        update_list.append(("\n".join(sentences), teacher["id"]))

    print(len(update_list))
    u_sql = "update teacher set eduexp = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
def mentor_extract():
    """Store, per patent, the inventors that appear in the mentor list.

    `INVIEW` of every `pss_zhuanli_copy` row is split on ';'. The names that
    are listed in `qinghua/mentor_list.txt` are joined with ';' and written
    to the row's `MENTOR` column.

    Empty name fragments are ignored, so a blank line in the mentor file
    cannot make an empty inventor count as a mentor.

    :return: None
    """
    info_list = dbs.getDics("SELECT * FROM `pss_zhuanli_copy`;")

    with open('.\\qinghua\\mentor_list.txt', 'r', encoding='utf-8') as mentor_file:
        mentor_dict = {}.fromkeys(mentor_file.read().split('\n'))
    print(mentor_dict)
    print("*" * 10)

    update_list = []
    for item in info_list:
        author_list = item['INVIEW'].split(';')
        mentor_list = [author for author in author_list if author and author in mentor_dict]
        print(author_list, mentor_list)
        update_list.append((";".join(mentor_list), item['id']))

    u_sql = "UPDATE `pss_zhuanli_copy` SET MENTOR = %s WHERE id=%s"
    print(len(update_list))
    print(dbs.exe_many(u_sql, update_list))
def clear_7():
    """Print `ne` entries whose `org` swallowed a date while the line has several.

    Inspects up to 1000 `teacher_eduexp` rows (ne != '', type = 1, ok = 0).
    For entry i of the `ne` list: if `org` contains exactly one date-like
    run, and line i of `exp_clear` (spaces removed) contains more than one,
    the runs and the line are printed. Nothing is written back.

    The date patterns are rebuilt with the hyphen escaped. As written, the
    original character class contained the range "月-~", which `re` rejects.

    :return: None
    """
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ne != '' and type=1 and ok = 0 limit 1000"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    # Runs of 4+ date characters: digits, 年, 月, '.', '-', tildes, dashes, '/'.
    # NOTE(review): the original listed "~" twice; the second one was
    # presumably the full-width tilde (U+FF5E), which is included here.
    org_date = re.compile(r'[0-9年月.\-~\uff5e—/]{4,}')
    exp_date = re.compile(r'[0-9年月.\-~\uff5e—―/]{4,}')

    for teacher in teacher_list:
        exp_list = teacher["exp_clear"].split('\n')
        # NOTE(review): eval() executes whatever text is stored in `ne`.
        ne_list = eval(teacher["ne"])
        if not ne_list:
            continue
        print(teacher["id"])
        print(teacher["exp_clear"])
        for i, ne in enumerate(ne_list):
            # Entries without a matching text line cannot be compared.
            if not ne or i >= len(exp_list):
                continue
            if len(org_date.findall(ne.get("org", ""))) != 1:
                continue
            exp = exp_list[i].replace(' ', '')
            da = exp_date.findall(exp)
            if len(da) > 1:
                print(da)
                print(exp)
def f():
    """Rebuild the `info` text of 清华大学 teachers from their stored HTML.

    style/script/head elements (lower- and upper-case tag names), comments
    and inline tags are stripped using the module's patterns. The remaining
    HTML is cut into blocks around "个人简介|个人简历" and the teacher's name
    (without any parenthesised suffix). The joined blocks are written to
    `eds_985teacher.info` in batches of 1000.

    Full-width parentheses in the name are normalised with `str.replace`. As
    written, the original passed a bare "(" to `re.sub`, which is not a valid
    pattern (presumably the full-width characters were lost - confirm).

    NOTE(review): this file defines `f` more than once; the last definition
    is the one in effect.

    :return: None
    """
    select_sql = "SELECT id, name, html FROM `eds_985teacher` WHERE school = '清华大学';"
    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))

    update_sql = "update eds_985teacher set info=%s where id=%s"
    update_list = []
    for teacher in teacher_list:
        html = teacher["html"]
        if html is None or html == "":
            continue

        for tag in ("style", "script", "head"):
            for variant in (tag, tag.upper()):
                html = re.sub(reTRIM_closing.format(variant), "", html)
        html = re.sub(reCOMM, "", html)
        for re_tag in inline_tags:
            html = re.sub(re_tag, "", html)

        # U+FF08 / U+FF09 are the full-width parentheses.
        name = teacher["name"].replace('\uff08', '(').replace('\uff09', ')')
        name = re.sub(r'\(.*?\)', '', name)

        text_list = cut_blocks(html, re_list=[r'个人简介|个人简历', name])
        if not text_list:
            continue
        text = "\n".join(text_list)
        if text:
            print(teacher["id"])
            update_list.append((text, teacher["id"]))
        if len(update_list) == 1000:
            print("插入……1000")
            print(dbs.exe_many(update_sql, update_list))
            update_list = []

    if update_list:
        print("插入……%s" % len(update_list))
        print(dbs.exe_many(update_sql, update_list))
def clear_2():
    """Keep only the `edu_exp` lines that carry education information.

    For `teacher_eduexp` rows with type = 2, every line that matches a
    job-title, job, publication or birth pattern is dropped. When lines
    remain, they are written to `exp_clear` together with clear = 2.

    :return: None
    """
    teacher_list = dbs.getDics("select id, edu_exp from teacher_eduexp where type = 2")
    print(len(teacher_list))

    re_title = r'讲师|教授|指导|博士生导师|研究生导师|硕士生导师|从事|研究员|所长|院长|博导|硕导'
    re_job = r'任教|任|从事|留校|留院'
    re_publish = r'《|》|出版|学报|杂志'
    re_birth = r'[1-2][9,0][0-9]{2}生|出生|生于|年生'
    drop_patterns = (re_title, re_job, re_publish, re_birth)

    update_list = []
    num = 0
    for teacher in teacher_list:
        edu_exp = teacher["edu_exp"]
        if edu_exp == "" or edu_exp is None:
            continue
        new_lines = [
            line for line in edu_exp.split('\n')
            if not any(re.search(pattern, line) for pattern in drop_patterns)
        ]
        if new_lines:
            print(teacher["id"])
            num += 1
            update_list.append(('\n'.join(new_lines), 2, teacher["id"]))

    print(num)
    print(len(update_list))
    update_sql = "update teacher_eduexp set exp_clear = %s, clear=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
def get_school():
    """Compute the school score stored in `teacher_rank.school`.

    Weights: 2 when the school's `characteristic` mentions "985", otherwise
    1 when it mentions "211", otherwise 0 (the value every row is reset to
    first).

    :return: None
    """
    import re

    sql_initial = ''' UPDATE teacher_rank SET school = 0 '''
    print(dbs.exe_sql(sql_initial))

    s_sql = (
        " SELECT teacher.id as id, school_info.characteristic as characteristic "
        "FROM `teacher`, `school_info` "
        "WHERE teacher.school_id = school_info.id AND teacher.school_id != 0; "
    )
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    u_list = []
    for teacher in teacher_list:
        characteristic = teacher['characteristic']
        if characteristic is None or characteristic == "":
            continue
        if re.search('985', characteristic):
            weight = 2
        elif re.search('211', characteristic):
            weight = 1
        else:
            continue
        u_list.append((weight, teacher['id']))
    print(len(u_list))

    u_sql = ''' UPDATE teacher_rank SET school=%s WHERE teacher_id=%s '''
    print(dbs.exe_many(u_sql, u_list))
def create_sheet():
    """Export one spreadsheet row per (mentor, patent) pair.

    For every `pss_zhuanli_clean` row, the inventors listed in
    `qinghua/teacher_list.txt` are collected, and one row is written per
    ';'-separated entry of `MENTOR`: mentor, teachers, TIVIEW, PAVIEW, APD,
    PD. The total number of mentor entries is printed at the end.

    Blank lines of the teacher file are ignored, so an empty inventor
    fragment is never reported as a teacher.

    :return: None
    """
    import xlwt

    info_list = dbs.getDics("SELECT * FROM `pss_zhuanli_clean`;")
    with open('.\\qinghua\\teacher_list.txt', 'r', encoding='utf-8') as teacher_file:
        teacher_names = {name for name in teacher_file.read().split('\n') if name}

    wbk = xlwt.Workbook(encoding='utf-8')
    sheet = wbk.add_sheet('sheet1')
    row = 0
    mentor_total = 0
    for info in info_list:
        teachers = ";".join(
            author for author in info['INVIEW'].split(';') if author in teacher_names
        )
        mentor_list = info['MENTOR'].split(';')
        mentor_total += len(mentor_list)
        for mentor in mentor_list:
            if len(mentor_list) > 1:
                # Marks patents that have more than one mentor.
                print("======")
            print(mentor, teachers, info['TIVIEW'], info['PAVIEW'], info['APD'], info['PD'])
            cells = (mentor, teachers, info['TIVIEW'], info['PAVIEW'], info['APD'], info['PD'])
            for col, value in enumerate(cells):
                sheet.write(row, col, value)
            row += 1

    wbk.save('.\\qinghua\\清华院士_专利信息_2018.9.30.xls')
    print(mentor_total)
def zhuanli_guanxi_extract():
    """Export every pair of co-listed teachers per patent to an .xls sheet.

    `TEACHERS` of each `pss_zhuanli_copy` row is split on ';'. For each
    unordered pair (in list order), a row (first, second, title) is written.
    The number of rows written is printed.

    :return: None
    """
    import xlwt

    info_list = dbs.getDics("SELECT TEACHERS, TIVIEW FROM `pss_zhuanli_copy`;")
    wbk = xlwt.Workbook(encoding='utf-8')
    sheet = wbk.add_sheet('sheet1')

    row = 0
    for info in info_list:
        teachers = info['TEACHERS'].split(';')
        title = info['TIVIEW']
        for position, first in enumerate(teachers):
            for second in teachers[position + 1:]:
                sheet.write(row, 0, first)
                sheet.write(row, 1, second)
                sheet.write(row, 2, title)
                row += 1

    wbk.save('.\\qinghua\\材料学院专利合著信息.xls')
    print(row)
def check_word():
    """Print discipline-'0812' papers whose title contains selected keywords.

    Each title is checked against the keyword patterns in turn; for every
    pattern that matches, the paper id and the title are printed (so a title
    matching several patterns is printed several times). The abstract is
    currently not searched.

    :return: None
    """
    import re

    patterns = (r'半监督', r'k近邻|K近邻', r'k值|K值', r'CNN|cnn')
    s_sql = "SELECT id, title, abstract FROM paper_data WHERE discipline='0812'"
    for data in dbs.getDics(s_sql):
        # The abstract is deliberately left out of the searched text.
        text = data['title'] + ""
        for pattern in patterns:
            if re.findall(pattern, text):
                print(data['id'])
                print(text)