def get_paper_data():
    """Copy discipline-08 papers from ``paper_clean1`` into ``paper_data``.

    The id range (0, 6000000] is walked in windows of 10000 ids.  For every
    window the papers whose author has a discipline code starting with
    ``08`` are selected, rows with a missing name or abstract are skipped,
    and the remaining rows are inserted in batches of at most 5000.  The
    total number of rows handed to the insert statement is printed at the
    end.
    """
    lower, upper, window = 0, 6000000, 10000
    batch_size = 5000
    s_sql = (
        " SELECT t1.id, t1.`name`, t1.abstract, t2.discipline_code"
        " FROM paper_clean1 t1, teacher_dis_code t2"
        " WHERE t2.id = t1.author_id AND t2.discipline_code like '08%%'"
        " and t1.id > %s and t1.id <= %s; "
    )
    u_sql = "INSERT paper_data(id, title, abstract, discipline) VALUES(%s, %s, %s, %s);"

    inserted = 0
    while lower + window <= upper:
        query = s_sql % (str(lower), str(lower + window))
        print(query)
        pending = []
        for row in dbs.getDics(query):
            if row["name"] and row["abstract"]:
                pending.append(
                    (row["id"], row["name"], row["abstract"], row["discipline_code"])
                )
                if len(pending) == batch_size:
                    # Flush a full batch and start collecting the next one.
                    print(dbs.exe_many(u_sql, pending))
                    pending = []
                    inserted += batch_size
        # Flush whatever is left for this window (possibly an empty list).
        inserted += len(pending)
        print(dbs.exe_many(u_sql, pending))
        lower += window
    print(inserted)
def zhuanli_duplicate():
    """Deduplicate patents in ``pss_zhuanli_copy``.

    The row returned for each (TIVIEW, INVIEW) group by the GROUP BY query is
    kept; every other row of the table is deleted by id.

    NOTE: a function with the same name is defined again further down in
    this file; at import time that later definition replaces this one.
    """
    kept_rows = dbs.getDics("SELECT * FROM `pss_zhuanli_copy` GROUP BY TIVIEW, INVIEW")
    print(len(kept_rows))
    kept_ids = {str(row['id']) for row in kept_rows}

    all_rows = dbs.getDics("SELECT * FROM `pss_zhuanli_copy`")
    # Everything that is not a group representative is a duplicate.
    id_list = [row['id'] for row in all_rows if str(row['id']) not in kept_ids]
    print(len(id_list))

    d_sql = ''' DELETE FROM `pss_zhuanli_copy` WHERE id =%s '''
    print(dbs.exe_many(d_sql, id_list))
def data_clean():
    """Rewrite ``javascript:window.open(...)`` links into ordinary URLs.

    Reads the ``eds_985teacher`` rows of school '中南大学' whose ``link``
    contains ``javascript``, for example
    ``javascript:window.open('/blog/content2?name='+encodeURI('周雄伟'))``.
    The quoted path and the ``encodeURI`` argument are extracted, the
    argument is quoted with ``pa.quote`` and appended to the path, the result
    is resolved against the row's ``institution_url`` with ``pa.urljoin`` and
    written to the ``all_link`` column.

    Rows whose link is empty are ignored.  Rows whose link does not have the
    expected ``open('...'+encodeURI('...'))`` shape are reported and skipped
    instead of aborting the whole run with an IndexError.
    """
    rows = dbs.getDics(
        "SELECT * FROM `eds_985teacher` WHERE link like '%javascript%' AND school = '中南大学';"
    )
    print(len(rows))

    link_re = re.compile(r"open\('(.+?)'\+encodeURI\('(.+?)'\)\)")
    u_list = []
    for row in rows:
        link = row['link']
        if not link:
            continue
        match = link_re.search(link)
        if match is None:
            # The SQL filter only guarantees the substring "javascript".
            print("unrecognized link, skipped:", row['id'], link)
            continue
        path, name = match.groups()
        link = pa.urljoin(row['institution_url'], path + pa.quote(name))
        print(link)
        u_list.append((link, row['id']))

    print(len(u_list))
    u_sql = "UPDATE eds_985teacher SET all_link=%s WHERE id = %s"
    print(dbs.exe_many(u_sql, u_list))
def get_email():
    """Extract e-mail addresses from teacher info and store them.

    Processes every ``teacherdata_info`` row with ``id >= 40146`` and an
    empty ``email`` column:

    * when the homepage is on ``cksp.eol.cn`` the ``info`` column is
      evaluated as a Python literal and only its ``"E-mail"`` entry is
      searched with ``reEmail``;
    * otherwise the raw ``info`` text is searched after replacing ``[at]``
      with ``@`` and removing spaces and newlines.

    The first element of each ``reEmail`` match is taken as the address
    (presumably the first capture group is the whole address - confirm
    against the ``reEmail`` definition).  Duplicate addresses and addresses
    listed in the institution e-mail dictionary file are dropped; what
    remains is joined with ``;`` and written back to ``email``.
    """
    info_sql = "select id, info, homepage from teacherdata_info where id >= 40146 and email=''"
    rows = dbs.getDics(info_sql)

    # A set gives O(1) membership tests; the context manager closes the file.
    with open(DIR + "\\dicts\\institution_email.txt", "r", encoding="utf-8") as fh:
        institution_emails = {line.strip('\n') for line in fh}

    update_list = []
    for row in rows:
        if not row["info"]:
            continue

        if re.search(r'cksp\.eol\.cn', row["homepage"]) is not None:
            # SECURITY: eval() executes whatever is stored in the column.
            # ast.literal_eval would be safer if the values are plain
            # literals - verify before switching.
            info_dict = eval(row["info"])
            try:
                email_text = [m[0] for m in re.findall(reEmail, info_dict["E-mail"])]
            except (KeyError, TypeError, IndexError):
                # No usable "E-mail" entry for this teacher.
                continue
        else:
            info_text = row["info"]
            info_text = info_text.replace("[at]", "@")
            info_text = info_text.replace(" ", "")
            info_text = info_text.replace("\n", "")
            email_text = [m[0] for m in re.findall(reEmail, info_text)]

        if not email_text:
            continue

        # Drop duplicates while keeping first-seen order, then drop
        # institution-wide addresses.
        list_email = [
            address
            for address in dict.fromkeys(email_text)
            if address not in institution_emails
        ]
        if list_email:
            joined = ";".join(list_email)
            print(joined)
            update_list.append((joined, row["id"]))

    print(len(update_list))
    update_sql = "update teacherdata_info set email=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
def t_():
    """Set ``zhuanli_search.status`` to 0 for the ids listed in a text file.

    The ids are read one per line from the ``id_list.txt`` file in the
    ``qinghua`` directory and passed as-is to the batched UPDATE.
    """
    # The context manager closes the file; the original left the handle open.
    with open('.\\qinghua\\id_list.txt', 'r', encoding='utf-8') as fh:
        # NOTE: a trailing newline in the file yields a final empty id; this
        # is kept exactly as before.
        id_list = fh.read().split('\n')
    print(len(id_list))
    u_sql = "UPDATE zhuanli_search SET status=0 where id = %s"
    print(dbs.exe_many(u_sql, id_list))
def get_edu_exp():
    """Extract education-history items for ``teacher_eduexp`` rows of type 0.

    Every non-empty ``info_clear`` text is handed to one shared
    ``TextAttribute`` instance (``set_text``, ``seg_sentence`` on newlines,
    ``compute_gravity``).  ``get_edu_items`` then returns a type value and a
    list of items; when the list is non-empty the items are joined with
    newlines and written to ``edu_exp`` together with the returned type.
    """
    rows = dbs.getDics("select id, info_clear from teacher_eduexp where type = 0")
    print(len(rows))

    analyzer = TextAttribute()
    update_list = []
    for row in rows:
        text = row["info_clear"]
        if text is None or text == "":
            continue
        analyzer.set_text(text)
        analyzer.seg_sentence("\n")
        analyzer.compute_gravity()
        item_type, edu_items = analyzer.get_edu_items()
        if not edu_items:
            continue
        print(row["id"])
        print(item_type, edu_items)
        update_list.append(("\n".join(edu_items), item_type, row["id"]))

    # The hit counter and the list length are always equal; both lines of
    # output are kept.
    matched = len(update_list)
    print(matched)
    print(matched)
    update_sql = "update teacher_eduexp set edu_exp=%s, type=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
def date2date():
    """Normalize the date strings stored in ``teacher_eduexp.ne``.

    For every row with ``ok = 1`` the ``ne`` column is evaluated into a list
    of entity dicts and each entry's ``date`` is rewritten:

    * range separators (hyphen, tilde, double em dash, '至') become ``-``;
    * '年' becomes ``.``, '月' is removed, a ``.`` directly before ``;`` is
      dropped and leading/trailing dots are stripped;
    * a ``.`` directly before ``-`` is dropped.

    For example ``2017年3月-2017年7月`` becomes ``2017.3-2017.7``.  Rows in
    which at least one rule matched are written back as ``str(ne_list)``.

    Rows whose ``ne`` value cannot be evaluated are reported by id and
    skipped.  Previously execution fell through and reused the list of the
    preceding row (or failed on the very first row).
    """
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    # NOTE(review): the tilde alternative appears twice; one of them was
    # possibly a full-width tilde originally - confirm against real data.
    range_separators = r'-|~|~|——|至'

    update_list = []
    for teacher in teacher_list:
        try:
            # SECURITY: eval() executes whatever is stored in the column.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        changed = False
        for ne in ne_list:
            if not ne:
                continue
            date = ne.get("date", "")
            if re.findall(range_separators, date):
                date = re.sub(range_separators, '-', date)
                changed = True
            if re.findall(r'年', date):
                date = re.sub(r'年', '.', date)
                date = re.sub(r'月', '', date)
                date = re.sub(r'\.;', ';', date)
                date = date.strip('.')
                changed = True
            if re.findall(r'\.-', date):
                date = re.sub(r'\.-', '-', date)
                changed = True
            ne["date"] = date

        if changed:
            print(ne_list)
            update_list.append((str(ne_list), teacher["id"]))
            print("-" * 10)

    # The change counter and the list length are always equal; both lines of
    # output are kept.
    print(len(update_list))
    print(len(update_list))
    u_sql = "update teacher_eduexp set ne = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
def f():
    """Rebuild the ``info`` text of 清华大学 teachers from their stored HTML.

    For each teacher with non-empty ``html``:

    1. ``style``, ``script`` and ``head`` elements are removed using the
       ``reTRIM_closing`` template with both the lower- and upper-case tag
       name, then ``reCOMM`` matches (presumably HTML comments - confirm)
       and every pattern in ``inline_tags`` are removed;
    2. the teacher name is stripped of any parenthesised part, with
       full-width parentheses folded to ASCII first;
    3. ``cut_blocks`` is called with the cleaned HTML and the two patterns
       ``个人简介|个人简历`` and the cleaned name;
    4. the returned blocks are joined with newlines and written to ``info``
       in batches of 1000 rows.

    The parentheses are now folded with literal ``str.replace`` calls: a bare
    ``(`` or ``)`` is not a valid regular expression and makes ``re.sub``
    raise ``re.error``.
    """
    update_sql = "update eds_985teacher set info=%s where id=%s"
    batch_size = 1000

    select_sql = "SELECT id, name, html FROM `eds_985teacher` WHERE school = '清华大学';"
    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))

    update_list = []
    for teacher in teacher_list:
        html = teacher["html"]
        if html is None or html == "":
            continue

        for tag in ("style", "script", "head"):
            html = re.sub(reTRIM_closing.format(tag), "", html)
            html = re.sub(reTRIM_closing.format(tag.upper()), "", html)
        html = re.sub(reCOMM, "", html)
        for re_tag in inline_tags:
            html = re.sub(re_tag, "", html)

        # U+FF08 / U+FF09 are the full-width parentheses.
        name = teacher["name"].replace("\uff08", "(").replace("\uff09", ")")
        name = re.sub(r'\(.*?\)', '', name)

        text_list = cut_blocks(html, re_list=[r'个人简介|个人简历', name])
        if not text_list:
            continue
        text = "\n".join(text_list)
        if not text:
            continue

        print(teacher["id"])
        update_list.append((text, teacher["id"]))
        if len(update_list) == batch_size:
            print("插入……1000")
            print(dbs.exe_many(update_sql, update_list))
            update_list = []

    # Flush the final partial batch.
    if update_list:
        print("插入……%s" % len(update_list))
        print(dbs.exe_many(update_sql, update_list))
def get_abroad():
    """Flag teachers whose education history names an organisation abroad.

    The ``ne`` column of every ``teacher_eduexp`` row with ``ok = 1`` is
    evaluated into a list of entity dicts.  Each ``org`` value is split on
    ``;`` and shortened (``大学…系`` / ``大学…学院`` collapse to ``大学``).
    An organisation counts as abroad when

    * it is listed in the ``in.txt`` dictionary file, or
    * it has no entry in the school dictionary, matches the foreign keyword
      pattern and does not match the domestic keyword pattern.

    Teachers with at least one such organisation get ``abroad = '1'`` in
    ``teacher_edu_description``.

    Changes from the previous version: both dictionary files are closed
    after reading; blank lines in ``in.txt`` are ignored, so an entry without
    an ``org`` is no longer flagged through the empty key; ``None`` entries
    in the ``ne`` list are skipped instead of raising.
    """
    # SECURITY: eval() runs whatever the dictionary file contains; only use
    # with trusted files.
    with open(".\\dicts\\school2en_dict.txt", "r", encoding='utf8') as fh:
        school_dict = eval(fh.read())
    with open(".\\dicts\\in.txt", "r", encoding='utf8') as fh:
        abroad = {line for line in fh.read().split('\n') if line}

    # Compiled once instead of once per organisation.
    foreign_re = re.compile(
        '国|日本|澳大利亚|州|芬兰|瑞典|挪威|冰岛|丹麦|爱沙尼亚'
        '|拉脱维亚|立陶宛|白俄罗斯|俄罗斯|乌克兰|摩尔多瓦|波兰|捷克'
        '|斯洛伐克|匈牙利|德国|奥地利|瑞士|列支敦士登|英国|爱尔兰|荷兰'
        '|比利时|卢森堡|法国|摩纳哥|罗马尼亚|保加利亚|塞尔维亚|马其顿'
        '|阿尔巴尼亚|希腊|斯洛文尼亚|克罗地亚|波斯尼亚和墨塞哥维那'
        '|意大利|梵蒂冈|圣马力诺|马耳他|西班牙|葡萄牙|安道尔'
    )
    domestic_re = re.compile(
        '中国|首都|华东|华北|华南|华西|华中|西北|西南|东北|东南|北京|天津|上海|重庆|河北'
        '|山西|辽宁|吉林|黑龙江|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|广东|海南|四川'
        '|贵州|云南|陕西|甘肃|青海|台湾|内蒙|广西|西藏|宁夏|新疆|香港|澳门|石家庄|沈阳'
        '|哈尔滨|杭州|福州|济南|广州|武汉|成都|昆明|兰州|台北|南宁|银川|太原|长春|南京|合肥'
        '|南昌|郑州|长沙|海口|贵阳|西安|西宁|呼和浩特|拉萨|乌鲁木齐'
    )

    def is_abroad_org(org):
        """Return True when a single organisation name counts as abroad."""
        org = re.sub('大学.+?系', '大学', org)
        org = re.sub('大学.+?学院', '大学', org)
        if org in abroad:
            return True
        return (
            school_dict.get(org, "") == ""
            and foreign_re.search(org) is not None
            and domestic_re.search(org) is None
        )

    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    update_list = []
    for teacher in teacher_list:
        try:
            # SECURITY: eval() executes whatever is stored in the column.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        studied_abroad = any(
            is_abroad_org(org)
            for ne in ne_list
            if ne
            for org in ne.get("org", "").split(';')
        )
        if studied_abroad:
            update_list.append(("1", teacher["id"]))

    # The hit counter and the list length are always equal; both lines of
    # output are kept.
    print(len(update_list))
    print(len(update_list))
    u_sql = "update teacher_edu_description set abroad = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
def get_institution():
    """Compute the institution (school/discipline) score of every teacher.

    Steps:

    1. reset ``teacher_dis_code.dis_rank`` to 0;
    2. join ``teacher_dis_code`` with ``discipline_school`` on school and on
       ``discipline_code = root``; for every distinct (school, discipline
       code, ``code``) combination add 5 to ``dis_rank`` when ``code`` has 4
       characters and 1 when it has 6 characters (presumably first-level and
       second-level key disciplines - confirm); other lengths add nothing;
    3. copy ``dis_rank`` into ``teacher_rank.institution``.

    NOTE(review): earlier documentation of this function listed the weights
    as 2 / 1 / 0, while the code applies 5 and 1 - confirm which is intended.
    """
    sql_initial = ''' UPDATE teacher_dis_code SET dis_rank = 0 '''
    print(dbs.exe_sql(sql_initial))

    s_sql = (
        " SELECT teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`"
        " FROM `teacher_dis_code`,`discipline_school`"
        " WHERE teacher_dis_code.discipline_code != ''"
        " AND teacher_dis_code.discipline_code IS NOT NULL"
        " AND teacher_dis_code.discipline_code = discipline_school.root"
        " AND teacher_dis_code.school = discipline_school.school"
        " GROUP BY teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`; "
    )
    # Length of discipline_school.code -> amount added to dis_rank.
    weight_by_code_length = {4: 5, 6: 1}

    u_list = []
    for row in dbs.getDics(s_sql):
        weight = weight_by_code_length.get(len(row['code']))
        if weight is not None:
            u_list.append((weight, row['school'], row['discipline_code']))
    print(len(u_list))

    u_sql = ''' UPDATE teacher_dis_code SET dis_rank = dis_rank + %s WHERE school=%s AND discipline_code=%s '''
    print(dbs.exe_many(u_sql, u_list))

    sql_initial_rank = ''' UPDATE teacher_rank, teacher_dis_code SET teacher_rank.institution = teacher_dis_code.dis_rank WHERE teacher_rank.teacher_id = teacher_dis_code.id; '''
    print(dbs.exe_sql(sql_initial_rank))
def zhuanli_duplicate():
    """Delete duplicate patents from ``pss_zhuanli_copy``.

    One row per (TIVIEW, INVIEW, APD) group is read from ``pss_zhuanli``.
    For each of them, every ``pss_zhuanli_copy`` row with the same three
    values but a different id is deleted.

    NOTE: this is the second function named ``zhuanli_duplicate`` in this
    file; being defined later, it is the one the module name refers to.
    """
    representatives = dbs.getDics("SELECT * FROM `pss_zhuanli` GROUP BY TIVIEW, INVIEW, APD")
    update_list = [
        (row['TIVIEW'], row['INVIEW'], row['APD'], row['id'])
        for row in representatives
    ]
    for entry in update_list:
        print(entry)
    print(len(update_list))

    d_sql = ''' DELETE FROM `pss_zhuanli_copy` WHERE TIVIEW=%s AND INVIEW=%s AND APD=%s AND id !=%s '''
    print(dbs.exe_many(d_sql, update_list))
def ne2sentence():
    """Turn the extracted education entities of each teacher into sentences.

    The ``ne`` column of every ``teacher_eduexp`` row with ``ok = 1`` is
    evaluated into a list of entity dicts.  For each entry the names of the
    non-empty fields (in the fixed ``ne_name`` order) are joined with commas
    into a signature; unseen non-empty signatures are appended to the
    imported ``sentence_template`` list.  When the entry's ``degree`` is
    empty or one of 学士 / 硕士 / 博士, ``nn(signature, entry)`` is called and
    a non-empty result is collected.  The collected strings are joined with
    newlines and written to ``teacher.eduexp``.

    Rows whose ``ne`` value cannot be evaluated are reported by id and
    skipped; only ``Exception`` subclasses are caught, so Ctrl-C still stops
    the run.
    """
    from algorithm.li.extract.templates.ne2sentence_template import sentence_template

    ne_name = ["org", "date", "degree", "country", "state_or_province", "major",
               "discipline_category", "graduate"]
    accepted_degrees = ("", "学士", "硕士", "博士")

    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    update_list = []
    for teacher in teacher_list:
        try:
            # SECURITY: eval() executes whatever is stored in the column.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        sentences = []
        for ne in ne_list:
            present = [field for field in ne_name if ne.get(field, "") != ""]
            signature = ",".join(present)
            # This grows the imported template list in place.
            if signature != "" and signature not in sentence_template:
                sentence_template.append(signature)
            if ne.get("degree", "") in accepted_degrees:
                sentence = nn(signature, ne)
                if sentence != "":
                    sentences.append(sentence)
        update_list.append(("\n".join(sentences), teacher["id"]))

    print(len(update_list))
    u_sql = "update teacher set eduexp = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
def mentor_extract():
    """Fill ``pss_zhuanli_copy.MENTOR`` with the inventors listed as mentors.

    The mentor names are read one per line from the ``mentor_list.txt`` file
    in the ``qinghua`` directory.  For every patent row the ``INVIEW`` value
    is split on ``;`` and the names found in the mentor list are joined with
    ``;`` and written to ``MENTOR``.
    """
    s_sql = "SELECT * FROM `pss_zhuanli_copy`;"
    info_list = dbs.getDics(s_sql)

    # The context manager closes the file; the original left the handle open.
    with open('.\\qinghua\\mentor_list.txt', 'r', encoding='utf-8') as fh:
        # Only the keys matter; kept as a dict so the debug print below is
        # unchanged.
        mentor_dict = dict.fromkeys(fh.read().split('\n'))
    print(mentor_dict)
    print("*" * 10)

    update_list = []
    for item in info_list:
        author_list = item['INVIEW'].split(';')
        mentor_list = [author for author in author_list if author in mentor_dict]
        print(author_list, mentor_list)
        update_list.append((";".join(mentor_list), item['id']))

    u_sql = "UPDATE `pss_zhuanli_copy` SET MENTOR = %s WHERE id=%s"
    print(len(update_list))
    print(dbs.exe_many(u_sql, update_list))
def clear_2():
    """Keep only the education-related lines of ``edu_exp`` (rows of type 2).

    Each ``edu_exp`` text is split into lines.  Lines matching any of the
    title, job, publication or birth patterns are dropped.  When at least one
    line survives, the surviving lines are joined with newlines and written
    to ``exp_clear`` with ``clear = 2``.
    """
    teacher_list = dbs.getDics("select id, edu_exp from teacher_eduexp where type = 2")
    print(len(teacher_list))

    # Checked in this order: title, job, publication, birth.
    drop_patterns = (
        r'讲师|教授|指导|博士生导师|研究生导师|硕士生导师|从事|研究员|所长|院长|博导|硕导',
        r'任教|任|从事|留校|留院',
        r'《|》|出版|学报|杂志',
        r'[1-2][9,0][0-9]{2}生|出生|生于|年生',
    )

    update_list = []
    for teacher in teacher_list:
        edu_exp = teacher["edu_exp"]
        if edu_exp == "" or edu_exp is None:
            continue
        kept_lines = [
            line
            for line in edu_exp.split('\n')
            if not any(re.findall(pattern, line) for pattern in drop_patterns)
        ]
        if kept_lines:
            print(teacher["id"])
            update_list.append(('\n'.join(kept_lines), 2, teacher["id"]))

    # The hit counter and the list length are always equal; both lines of
    # output are kept.
    print(len(update_list))
    print(len(update_list))
    update_sql = "update teacher_eduexp set exp_clear = %s, clear=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
def get_total():
    """Compute the overall score of every teacher in ``teacher_rank``.

    1. Every selected column is divided by its column maximum and the seven
       dimensions (age, integrity, title, school, institution, main_lab,
       abroad) are added up into a base total.
    2. A normalized school score of 0 resets the running total to 0; a
       normalized school score of 1 doubles the base total.
    3. ``base total * normalized main_lab`` is then added in every case,
       including after the reset in step 2.

    The result is written to ``teacher_rank.total``.
    """
    import pandas as pd

    s_sql = ''' SELECT teacher_id, age, integrity, title, school, institution, main_lab, abroad FROM teacher_rank '''
    frame = dbs.get_teacher_dataframe(s_sql)
    teacher_ids = frame['teacher_id']
    normalized = frame / frame.max()

    # Added left to right, in this order.
    dimensions = ['age', 'integrity', 'title', 'school', 'institution',
                  'main_lab', 'abroad']
    running = normalized[dimensions[0]]
    for column in dimensions[1:]:
        running = running + normalized[column]
    normalized['total'] = running

    u_list = []
    for index in normalized.index:
        row = normalized.loc[index]
        base_total = row.total
        school_score = row.school
        if school_score == 0.0:
            total = 0
        elif school_score == 1.0:
            total = base_total + base_total
        else:
            total = base_total
        # Key-laboratory bonus.
        total = total + base_total * row.main_lab
        u_list.append((float(total), int(teacher_ids[index])))

    print(len(u_list))
    u_sql = 'UPDATE teacher_rank SET total = %s WHERE teacher_id = %s'
    print(dbs.exe_many(u_sql, u_list))
def get_school():
    """Compute the school score of every teacher.

    ``teacher_rank.school`` is first reset to 0 for all rows.  Each teacher
    with a non-zero ``school_id`` is then looked up in ``school_info``: a
    ``characteristic`` containing ``985`` scores 2, otherwise one containing
    ``211`` scores 1.  Teachers with an empty characteristic or neither
    marker keep the score 0.

    :return: None
    """
    import re

    print(dbs.exe_sql(''' UPDATE teacher_rank SET school = 0 '''))

    s_sql = ''' SELECT teacher.id as id, school_info.characteristic as characteristic FROM `teacher`, `school_info` WHERE teacher.school_id = school_info.id AND teacher.school_id != 0; '''
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    u_list = []
    for teacher in teacher_list:
        characteristic = teacher['characteristic']
        if characteristic is None or characteristic == "":
            continue
        # 985 takes precedence over 211.
        if re.search('985', characteristic) is not None:
            u_list.append((2, teacher['id']))
        elif re.search('211', characteristic) is not None:
            u_list.append((1, teacher['id']))
    print(len(u_list))

    u_sql = ''' UPDATE teacher_rank SET school=%s WHERE teacher_id=%s '''
    print(dbs.exe_many(u_sql, u_list))
def teacher_extract():
    """Fill ``pss_zhuanli_copy.TEACHERS`` with the inventors listed as teachers.

    The teacher names are read one per line from ``teacher.txt`` on the
    Administrator desktop.  For every patent row the ``INVIEW`` value is
    split on ``;`` and the names found in the teacher list are joined with
    ``;`` and written to ``TEACHERS``.
    """
    s_sql = "SELECT * FROM `pss_zhuanli_copy`;"
    info_list = dbs.getDics(s_sql)

    # The context manager closes the file; the original left the handle open.
    with open('C:\\Users\\Administrator\\Desktop\\teacher.txt', 'r', encoding='utf-8') as fh:
        # Only the keys matter; kept as a dict so the debug print below is
        # unchanged.
        teacher_dict = dict.fromkeys(fh.read().split('\n'))
    print(teacher_dict)
    print("*" * 10)

    update_list = []
    for item in info_list:
        author_list = item['INVIEW'].split(';')
        teacher_list = [author for author in author_list if author in teacher_dict]
        print(author_list, teacher_list)
        update_list.append((";".join(teacher_list), item['id']))

    u_sql = "UPDATE `pss_zhuanli_copy` SET TEACHERS = %s WHERE id=%s"
    print(len(update_list))
    print(dbs.exe_many(u_sql, update_list))
def get_age(current_year=2018):
    """Estimate the age of every teacher and store it in ``teacher_age``.

    For each ``teacherdata_info`` row an age description of the form
    ``<year>-<kind>`` is obtained:

    * for homepages containing ``http://ckspkk.eol.cn`` the ``info`` column
      is evaluated as a dict; a '出生年月' value longer than 3 characters
      gives ``<first 4 characters>-出生``, otherwise the '个人简介' text is
      passed to the ``Extractor``;
    * for all other homepages the values of the evaluated ``info`` dict are
      concatenated (falling back to the raw ``info`` text when that fails)
      and passed to the ``Extractor``.

    Rows without any text to analyse are skipped.  The description is then
    converted to an age with ``_age_from_description``.

    :param current_year: reference year used for the age computation; the
        default of 2018 reproduces the previously hard-coded value.
    """
    teacherdata = dbs.getDics("SELECT * from teacherdata_info")
    print(len(teacherdata))

    extractor = Extractor()
    uplist = []
    for teacher in teacherdata:
        raw_info = teacher['info']
        if 'http://ckspkk.eol.cn' in teacher['homepage']:
            # SECURITY: eval() executes whatever is stored in the column.
            info = eval(raw_info)
            birthday = info.get('出生年月', '')
            if len(birthday) > 3:
                # An explicit birth date is available.
                uplist.append((birthday[:4] + '-出生', teacher['id']))
                continue
            person_info = info.get('个人简介', None)
        else:
            try:
                person_info = "".join(list(eval(raw_info).values()))
            except Exception:
                # Not a dict literal (or not all-string values): use raw text.
                person_info = raw_info

        if person_info is None:
            continue
        extractor.set_text(person_info)
        uplist.append((extractor.get_birthday(), teacher['id']))

    uplistNew = []
    for age_description, teacher_id in uplist:
        age_description, age = _age_from_description(age_description, current_year)
        uplistNew.append((age_description, age, teacher_id))

    print(len(uplistNew))
    update_sql = "update teacher_age set age_description=%s,age=%s where id=%s"
    print(dbs.exe_many(update_sql, li=uplistNew))


def _age_from_description(age_description, current_year):
    """Convert a ``<year>-<kind>`` description into ``(description, age)``.

    ``kind`` selects the offset added to ``current_year - year``: 出生 adds 0,
    学士 adds 22, 硕士 adds 25 and 博士 adds 30.  An empty, malformed or
    unknown description, or an age outside 20..100, yields ``('', 0)``.
    """
    offsets = {'出生': 0, '学士': 22, '硕士': 25, '博士': 30}
    if not age_description:
        return '', 0
    parts = age_description.split('-')
    try:
        year = int(parts[0])
        year_type = parts[1]
    except (ValueError, IndexError):
        # Non-numeric year or missing kind: treat as unknown instead of
        # aborting the whole run.
        return '', 0
    offset = offsets.get(year_type)
    age = 0 if offset is None else current_year - year + offset
    if age > 100 or age < 20:
        return '', 0
    return age_description, age