Esempio n. 1
0
def get_paper_data():
    """
    Copy papers of discipline codes starting with '08' into ``paper_data``.

    ``paper_clean1`` is joined with ``teacher_dis_code`` and read in
    consecutive id windows; rows that have both a name and an abstract are
    inserted in batches. The number of rows handed to the insert helper is
    printed at the end.

    :return: None
    """
    first_id = 0
    last_id = 6000000
    window = 10000
    flush_size = 5000
    s_sql = '''
        SELECT t1.id, t1.`name`, t1.abstract, t2.discipline_code 
        FROM paper_clean1 t1, teacher_dis_code t2 
        WHERE t2.id = t1.author_id AND t2.discipline_code like '08%%' and t1.id > %s and t1.id <= %s;
    '''
    u_sql = "INSERT paper_data(id, title, abstract, discipline) VALUES(%s, %s, %s, %s);"
    inserted = 0
    # One iteration per id window (lower, lower + window].
    for lower in range(first_id, last_id - window + 1, window):
        query = s_sql % (str(lower), str(lower + window))
        print(query)

        pending = []
        for row in dbs.getDics(query):
            if row["name"] and row["abstract"]:
                pending.append((row["id"], row["name"], row["abstract"],
                                row["discipline_code"]))
                # Flush full batches so a single insert never grows unbounded.
                if len(pending) == flush_size:
                    print(dbs.exe_many(u_sql, pending))
                    inserted += flush_size
                    pending = []

        # Remainder of the window (may be empty).
        inserted += len(pending)
        print(dbs.exe_many(u_sql, pending))

    print(inserted)
Esempio n. 2
0
File: test.py Progetto: fengges/eds
def zhuanli_duplicate():
    """
    De-duplicate patents in ``pss_zhuanli_copy``.

    The rows returned by the ``GROUP BY TIVIEW, INVIEW`` query are kept;
    every other row of the table is deleted by id.

    NOTE(review): ``SELECT * ... GROUP BY`` presumably relies on MySQL running
    without ONLY_FULL_GROUP_BY, in which case the surviving row of each group
    is chosen by the server -- confirm this is acceptable.

    :return: None
    """
    s_sql = "SELECT * FROM `pss_zhuanli_copy` GROUP BY TIVIEW, INVIEW"
    save_list = dbs.getDics(s_sql)
    print(len(save_list))
    keep_ids = {str(row['id']) for row in save_list}

    s_sql = "SELECT * FROM `pss_zhuanli_copy`"
    all_rows = dbs.getDics(s_sql)

    # One parameter tuple per statement: DB-API executemany() expects a
    # sequence of parameter sequences, not bare scalars.
    delete_params = [(row['id'],) for row in all_rows
                     if str(row['id']) not in keep_ids]

    print(len(delete_params))
    d_sql = '''
            DELETE FROM `pss_zhuanli_copy`
            WHERE id =%s
    '''
    print(dbs.exe_many(d_sql, delete_params))
Esempio n. 3
0
File: test.py Progetto: fengges/eds
def data_clean():
    """
    Turn ``javascript:`` pseudo-links into ordinary URLs.

    Handles ``eds_985teacher`` rows with school = '中南大学' whose ``link``
    looks like
    ``javascript:window.open('/blog/content2?name='+encodeURI('周雄伟'))``.
    The path and the percent-encoded name are concatenated, resolved against
    the row's ``institution_url`` and stored in ``all_link``.

    :return: None
    """
    link_pattern = re.compile(r"open\('(.+?)'\+encodeURI\('(.+?)'\)\)")
    data_list = dbs.getDics(
        "SELECT * FROM `eds_985teacher` WHERE link like '%javascript%' AND school = '中南大学';"
    )
    print(len(data_list))
    u_list = []
    for data in data_list:
        match = link_pattern.search(data['link'] or "")
        if match is None:
            # The SQL filter only guarantees the word "javascript"; links
            # without the window.open(...+encodeURI(...)) shape are skipped
            # instead of aborting the whole run.
            continue
        path, name = match.groups()

        link = pa.urljoin(data['institution_url'], path + pa.quote(name))
        print(link)
        u_list.append((link, data['id']))

    print(len(u_list))
    u_sql = "UPDATE eds_985teacher SET all_link=%s WHERE id = %s"
    print(dbs.exe_many(u_sql, u_list))
Esempio n. 4
0
def get_email():
    """
    Extract e-mail addresses from teacher info text and store them.

    Rows of ``teacherdata_info`` with id >= 40146 and an empty ``email`` are
    scanned with ``reEmail``. For homepages on cksp.eol.cn the ``info`` column
    is evaluated and only its "E-mail" entry is searched; otherwise the raw
    text is searched after replacing "[at]" and removing spaces and newlines.
    Addresses listed in the institution e-mail file are dropped, the rest are
    joined with ';' and written to ``email``.

    :return: None
    """
    info_sql = "select id, info, homepage from teacherdata_info where id >= 40146 and email=''"
    info = dbs.getDics(info_sql)

    # Set for O(1) membership tests; the file is closed as soon as it is read.
    with open(DIR + "\\dicts\\institution_email.txt", "r", encoding="utf-8") as ins_file:
        ins_emails = {line.strip('\n') for line in ins_file}

    update_list = []
    for item in info:
        if not item["info"]:
            continue
        if re.search(r'cksp\.eol\.cn', item["homepage"] or "") is not None:
            # NOTE(review): eval() executes whatever is stored in the column;
            # consider ast.literal_eval if the payload is a plain literal.
            info_dict = eval(item["info"])
            try:
                email_text = [m[0] for m in re.findall(reEmail, info_dict["E-mail"])]
            except (KeyError, TypeError, IndexError):
                # No usable "E-mail" entry for this teacher.
                continue
        else:
            info_text = item["info"]
            info_text = info_text.replace("[at]", "@")
            info_text = info_text.replace(" ", "")
            info_text = info_text.replace("\n", "")
            email_text = [m[0] for m in re.findall(reEmail, info_text)]

        if email_text:
            # Drop duplicate addresses while keeping first-seen order.
            list_email = sorted(set(email_text), key=email_text.index)
            # Drop institution-wide addresses.
            list_email = [mail for mail in list_email if mail not in ins_emails]
            if list_email:
                joined = ";".join(list_email)
                print(joined)
                update_list.append((joined, item["id"]))
    print(len(update_list))
    update_sql = "update teacherdata_info set email=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
Esempio n. 5
0
File: test.py Progetto: fengges/eds
def t_():
    """
    Reset ``status`` to 0 in ``zhuanli_search`` for every id in the id list file.

    The file holds one id per line; blank lines (such as the one produced by
    a trailing newline) are ignored.

    :return: None
    """
    with open('.\\qinghua\\id_list.txt', 'r', encoding='utf-8') as id_file:
        raw_ids = id_file.read().split('\n')
    # One parameter tuple per statement, as DB-API executemany() expects.
    id_list = [(raw_id.strip(),) for raw_id in raw_ids if raw_id.strip()]
    print(len(id_list))
    u_sql = "UPDATE zhuanli_search SET status=0 where id = %s"
    print(dbs.exe_many(u_sql, id_list))
Esempio n. 6
0
def get_edu_exp():
    """
    Extract education items from ``teacher_eduexp`` rows of type 0.

    Each non-empty ``info_clear`` text is fed to a ``TextAttribute`` instance;
    when ``get_edu_items`` yields items, they are joined with newlines and
    written to ``edu_exp`` together with the returned type value.

    :return: None
    """
    rows = dbs.getDics("select id, info_clear from teacher_eduexp where type = 0")
    print(len(rows))
    analyzer = TextAttribute()

    matched = 0
    params = []
    for row in rows:
        text = row["info_clear"]
        if text is None or text == "":
            continue

        analyzer.set_text(text)
        analyzer.seg_sentence("\n")
        analyzer.compute_gravity()
        edu_type, edu_items = analyzer.get_edu_items()
        if not edu_items:
            continue

        print(row["id"])
        print(edu_type, edu_items)
        matched += 1
        params.append(("\n".join(edu_items), edu_type, row["id"]))

    print(matched)
    print(len(params))
    print(dbs.exe_many(
        "update teacher_eduexp set edu_exp=%s, type=%s where id = %s", params))
Esempio n. 7
0
def _normalize_date(date):
    """
    Normalise one date string.

    Range separators (-, ~, ——, 至) become '-', '年' becomes '.', '月' is
    dropped, and the stray dots this leaves behind ('.;', '.-', leading or
    trailing '.') are cleaned up.

    :param date: raw date text
    :return: tuple ``(normalised_text, touched)``; ``touched`` is True when
        any rewrite rule matched, even if the text ends up unchanged
    """
    touched = False
    if re.search(r'-|~|~|——|至', date):
        date = re.sub(r'-|~|~|——|至', '-', date)
        touched = True

    if re.search(r'年', date):
        date = re.sub(r'年', '.', date)
        date = re.sub(r'月', '', date)
        date = re.sub(r'\.;', ';', date)
        date = date.strip('.')
        touched = True

    if re.search(r'\.-', date):
        date = re.sub(r'\.-', '-', date)
        touched = True
    return date, touched


def date2date():
    r"""
    Unify the date format inside the ``ne`` column of ``teacher_eduexp``.

    Example input: 2017年3月-2017年7月
    Pattern noted by the original author: [0-9\-年\.月-~~\—―/]{4,}

    Rows with ok = 1 are read, every entry's "date" value is normalised and
    rows for which at least one rule matched are written back.

    :return: None
    """
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    num = 0
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever is stored in the column;
            # consider ast.literal_eval if the payload is a plain literal.
            ne_list = eval(teacher["ne"])
        except Exception:
            # Unparseable payload: report the row and move on, so the entries
            # of the previous teacher are never reused for this one.
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        changed = False
        for entry in ne_list:
            if not entry:
                continue
            date = entry.get("date", "")
            if not isinstance(date, str):
                # Nothing to normalise (e.g. a stored None).
                continue
            entry["date"], touched = _normalize_date(date)
            changed = changed or touched

        if changed:
            num += 1
            print(ne_list)
            update_list.append((str(ne_list), teacher["id"]))

    print("-" * 10)
    print(num)
    print(len(update_list))
    u_sql = "update teacher_eduexp set ne = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
Esempio n. 8
0
def f():
    """
    Rebuild the ``info`` text of teachers with school = '清华大学' from ``html``.

    style/script/head elements (lower- and upper-case tag names), comments
    and inline tags are removed with the module's regex helpers, then
    ``cut_blocks`` is asked for the blocks matching either the
    "personal profile" headings or the teacher's name. The joined text is
    written to ``info`` in batches of 1000 rows.

    :return: None
    """
    select_sql = "SELECT id, name, html FROM `eds_985teacher` WHERE school = '清华大学';"
    update_sql = "update eds_985teacher set info=%s where id=%s"
    batch_size = 1000

    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))
    update_list = []
    for teacher in teacher_list:
        html = teacher["html"]
        if html is None or html == "":
            continue
        for tag in ("style", "script", "head"):
            html = re.sub(reTRIM_closing.format(tag), "", html)
            html = re.sub(reTRIM_closing.format(tag.upper()), "", html)
        html = re.sub(reCOMM, "", html)
        for re_tag in inline_tags:
            html = re.sub(re_tag, "", html)

        # Map full-width brackets to ASCII ones with plain str.replace (a
        # bare '(' or ')' is not a valid regex), then drop any bracketed
        # suffix from the name.
        name = teacher["name"].replace('（', '(').replace('）', ')')
        name = re.sub(r'\(.*?\)', '', name)
        # NOTE(review): name is passed on unescaped; presumably cut_blocks
        # treats re_list entries as regular expressions -- confirm.
        text_list = cut_blocks(html, re_list=[r'个人简介|个人简历', name])
        if not text_list:
            continue
        text = "\n".join(text_list)
        if text:
            print(teacher["id"])
            update_list.append((text, teacher["id"]))
        if len(update_list) == batch_size:
            print("插入……%s" % len(update_list))
            print(dbs.exe_many(update_sql, update_list))
            update_list = []
    if update_list:
        print("插入……%s" % len(update_list))
        print(dbs.exe_many(update_sql, update_list))
Esempio n. 9
0
def get_abroad():
    """
    Flag teachers whose education records mention an organisation abroad.

    For every ``teacher_eduexp`` row with ok = 1 the "org" values of the
    evaluated ``ne`` entries are split on ';' and simplified
    ("大学...系" / "大学...学院" collapse to "大学"). An organisation counts as
    abroad when it is listed in the ``in.txt`` whitelist, or when it is
    unknown to ``school_dict``, contains a foreign place keyword and contains
    no domestic place keyword. Flagged teachers get ``abroad`` = "1".

    NOTE(review): rows are read from ``teacher_eduexp`` but the update
    targets ``teacher_edu_description`` -- presumably both share ids; confirm.

    :return: None
    """
    # NOTE(review): eval() runs the dictionary file as Python code; consider
    # ast.literal_eval if the file only contains a dict literal.
    with open(".\\dicts\\school2en_dict.txt", "r", encoding='utf8') as dict_file:
        school_dict = eval(dict_file.read())

    with open(".\\dicts\\in.txt", "r", encoding='utf8') as abroad_file:
        # Blank lines are dropped: an empty whitelist entry would match every
        # record that has no "org" value at all.
        abroad = {line for line in abroad_file.read().split('\n') if line}

    foreign_hint = re.compile('国|日本|澳大利亚|州|芬兰|瑞典|挪威|冰岛|丹麦|爱沙尼亚'
                              '|拉脱维亚|立陶宛|白俄罗斯|俄罗斯|乌克兰|摩尔多瓦|波兰|捷克'
                              '|斯洛伐克|匈牙利|德国|奥地利|瑞士|列支敦士登|英国|爱尔兰|荷兰'
                              '|比利时|卢森堡|法国|摩纳哥|罗马尼亚|保加利亚|塞尔维亚|马其顿'
                              '|阿尔巴尼亚|希腊|斯洛文尼亚|克罗地亚|波斯尼亚和墨塞哥维那'
                              '|意大利|梵蒂冈|圣马力诺|马耳他|西班牙|葡萄牙|安道尔')
    domestic_hint = re.compile('中国|首都|华东|华北|华南|华西|华中|西北|西南|东北|东南|北京|天津|上海|重庆|河北'
                               '|山西|辽宁|吉林|黑龙江|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|广东|海南|四川'
                               '|贵州|云南|陕西|甘肃|青海|台湾|内蒙|广西|西藏|宁夏|新疆|香港|澳门|石家庄|沈阳'
                               '|哈尔滨|杭州|福州|济南|广州|武汉|成都|昆明|兰州|台北|南宁|银川|太原|长春|南京|合肥'
                               '|南昌|郑州|长沙|海口|贵阳|西安|西宁|呼和浩特|拉萨|乌鲁木齐')

    def is_abroad(org):
        """Return True when one organisation name is judged to be abroad."""
        org = re.sub('大学.+?系', '大学', org)
        org = re.sub('大学.+?学院', '大学', org)
        if org in abroad:
            return True
        return (school_dict.get(org, "") == ""
                and foreign_hint.search(org) is not None
                and domestic_hint.search(org) is None)

    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    num = 0
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever is stored in the column.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        flagged = False
        for ne in ne_list:
            if not ne:
                continue
            orgs = (ne.get("org") or "").split(';')
            if any(is_abroad(org) for org in orgs):
                flagged = True
                break

        if flagged:
            num += 1
            update_list.append(("1", teacher["id"]))

    print(num)

    print(len(update_list))
    u_sql = "update teacher_edu_description set abroad = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
Esempio n. 10
0
def get_institution():
    """
    5. Institution score (``institution``).

    Resets ``teacher_dis_code.dis_rank`` to 0, then adds a weight for every
    distinct (school, discipline_code, discipline_school.code) match: 5 when
    the matched code is 4 characters long, 1 when it is 6 characters long.
    Finally the rank is copied into ``teacher_rank.institution``.

    NOTE(review): the original header listed weights of 2 (first-level key
    discipline), 1 (second-level key discipline) and 0 (none); the code adds
    5 and 1 -- confirm which is intended.

    :return: None
    """
    reset_sql = '''
            UPDATE teacher_dis_code
            SET dis_rank = 0
        '''
    print(dbs.exe_sql(reset_sql))

    match_sql = '''
        SELECT teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`
        FROM `teacher_dis_code`,`discipline_school` 
        WHERE teacher_dis_code.discipline_code != '' 
        AND teacher_dis_code.discipline_code IS NOT NULL 
        AND teacher_dis_code.discipline_code = discipline_school.root
        AND teacher_dis_code.school = discipline_school.school
        GROUP BY teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`;
    '''
    # Weight added per matched code, keyed by the length of that code.
    weight_by_code_len = {4: 5, 6: 1}
    increments = [
        (weight_by_code_len[len(row['code'])], row['school'],
         row['discipline_code'])
        for row in dbs.getDics(match_sql)
        if len(row['code']) in weight_by_code_len
    ]

    print(len(increments))
    add_sql = '''
        UPDATE teacher_dis_code
        SET dis_rank = dis_rank + %s
        WHERE school=%s AND discipline_code=%s
    '''
    print(dbs.exe_many(add_sql, increments))

    copy_sql = '''
        UPDATE teacher_rank, teacher_dis_code
        SET teacher_rank.institution = teacher_dis_code.dis_rank
        WHERE teacher_rank.teacher_id = teacher_dis_code.id;
    '''
    print(dbs.exe_sql(copy_sql))
0
File: test.py Progetto: haha8x/eds
def zhuanli_duplicate():
    """
    Delete duplicate patents that share TIVIEW, INVIEW and APD.

    For each row returned by the grouped query, every row of
    ``pss_zhuanli_copy`` with the same three values but a different id is
    deleted.

    NOTE(review): the representatives are read from ``pss_zhuanli`` while the
    delete targets ``pss_zhuanli_copy`` -- presumably the copy shares ids with
    the source table; confirm.

    :return: None
    """
    representatives = dbs.getDics(
        "SELECT * FROM `pss_zhuanli` GROUP BY TIVIEW, INVIEW, APD")

    delete_params = [
        (row['TIVIEW'], row['INVIEW'], row['APD'], row['id'])
        for row in representatives
    ]

    for params in delete_params:
        print(params)
    print(len(delete_params))
    d_sql = '''
            DELETE FROM `pss_zhuanli_copy`
            WHERE TIVIEW=%s
            AND INVIEW=%s
            AND APD=%s
            AND id !=%s
    '''
    print(dbs.exe_many(d_sql, delete_params))
Esempio n. 12
0
def ne2sentence():
    """
    Render the named entities of each education record as sentences.

    For every ``teacher_eduexp`` row with ok = 1 the ``ne`` column is
    evaluated; for each entry the names of its non-empty fields (in
    ``ne_name`` order, comma-joined) form a template key. Unknown keys are
    appended to the imported ``sentence_template`` list. Entries without a
    degree, or with a bachelor/master/doctor degree, are rendered through
    ``nn`` and the non-empty results are joined with newlines into
    ``teacher.eduexp``.

    :return: None
    """
    from algorithm.li.extract.templates.ne2sentence_template import sentence_template
    ne_name = ["org", "date", "degree", "country", "state_or_province", "major", "discipline_category", "graduate"]
    # Same list object as the imported one: additions below are visible to
    # every other user of sentence_template.
    s_t = sentence_template
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever is stored in the column;
            # consider ast.literal_eval if the payload is a plain literal.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            continue
        if not ne_list:
            continue

        str_list = []
        for ne in ne_list:
            if not ne:
                # Empty placeholder entry: nothing to render.
                continue
            s = ",".join([n for n in ne_name if ne.get(n, "") != ""])
            if s != "" and s not in s_t:
                s_t.append(s)

            degree = ne.get("degree", "")
            if degree == "" or degree in ["学士", "硕士", "博士"]:
                r = nn(s, ne)
                if r != "":
                    str_list.append(r)

        update_list.append(("\n".join(str_list), teacher["id"]))

    print(len(update_list))
    u_sql = "update teacher set eduexp = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
Esempio n. 13
0
File: test.py Progetto: fengges/eds
def mentor_extract(mentor_file='.\\qinghua\\mentor_list.txt'):
    """
    Fill ``MENTOR`` in ``pss_zhuanli_copy`` with the inventors who are mentors.

    ``INVIEW`` is split on ';' and every name found in the mentor list is
    kept, in the original order, joined with ';'.

    :param mentor_file: UTF-8 text file with one mentor name per line; blank
        lines are ignored
    :return: None
    """
    s_sql = "SELECT * FROM `pss_zhuanli_copy`;"
    info_list = dbs.getDics(s_sql)
    with open(mentor_file, 'r', encoding='utf-8') as handle:
        # Blank lines are dropped so that an empty author field (e.g. from a
        # trailing ';') can never count as a mentor.
        mentors = {name for name in handle.read().split('\n') if name}
    print(mentors)
    print("*" * 10)
    update_list = []
    for item in info_list:
        author_list = (item['INVIEW'] or "").split(';')
        mentor_list = [author for author in author_list if author in mentors]
        print(author_list, mentor_list)
        update_list.append((";".join(mentor_list), item['id']))

    u_sql = "UPDATE `pss_zhuanli_copy` SET MENTOR = %s WHERE id=%s"
    print(len(update_list))
    print(dbs.exe_many(u_sql, update_list))
Esempio n. 14
0
def clear_2():
    """
    Keep only the sentences that carry education information.

    For ``teacher_eduexp`` rows of type 2, every line of ``edu_exp`` that
    mentions a title, a job, a publication or a birth date is dropped. When
    lines remain they are written to ``exp_clear`` with ``clear`` = 2.

    :return: None
    """
    s_sql = "select id, edu_exp from teacher_eduexp where type = 2"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    re_title = r'讲师|教授|指导|博士生导师|研究生导师|硕士生导师|从事|研究员|所长|院长|博导|硕导'
    re_job = r'任教|任|从事|留校|留院'
    re_publish = r'《|》|出版|学报|杂志'
    # [90] rather than [9,0]: a comma inside a character class is a literal
    # comma, not a separator.
    re_birth = r'[1-2][90][0-9]{2}生|出生|生于|年生'
    noise_patterns = [re.compile(pattern)
                      for pattern in (re_title, re_job, re_publish, re_birth)]

    update_list = []
    num = 0
    for teacher in teacher_list:
        if teacher["edu_exp"] == "" or teacher["edu_exp"] is None:
            continue
        new_lines = [
            line for line in teacher["edu_exp"].split('\n')
            if not any(pattern.search(line) for pattern in noise_patterns)
        ]
        if new_lines:
            print(teacher["id"])
            num += 1
            update_list.append(('\n'.join(new_lines), 2, teacher["id"]))

    print(num)
    print(len(update_list))
    update_sql = "update teacher_eduexp set exp_clear = %s, clear=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
Esempio n. 15
0
def get_total():
    """
    Compute the overall score of every teacher in ``teacher_rank``.

    1. Each dimension is divided by its column maximum and the normalised
       dimensions are summed.
    2. The total is set to 0 when the normalised school score is 0 and is
       doubled when it is 1.
    3. The summed score multiplied by the normalised ``main_lab`` value is
       added on top.

    :return: None
    """
    s_sql = '''
        SELECT teacher_id, age, integrity, title, school, institution, main_lab, abroad FROM teacher_rank
    '''
    dimensions = ['age', 'integrity', 'title', 'school', 'institution',
                  'main_lab', 'abroad']
    df = dbs.get_teacher_dataframe(s_sql)

    scores = df[dimensions]
    # A column whose maximum is 0 (0 / 0) or a missing value yields NaN,
    # which would poison the whole total; such cells contribute 0 instead.
    df_normal = (scores / scores.max()).fillna(0)
    base_total = sum(df_normal[name] for name in dimensions)

    u_list = []
    rows = zip(df['teacher_id'], df_normal['school'], df_normal['main_lab'],
               base_total)
    for teacher_id, school, main_lab, base in rows:
        # Zero for the lowest school score, doubled for the highest.
        if school == 0.0:
            total = 0
        elif school == 1.0:
            total = base * 2
        else:
            total = base

        # Key-laboratory bonus, proportional to the unadjusted sum.
        total = total + base * main_lab

        u_list.append((float(total), int(teacher_id)))

    print(len(u_list))
    u_sql = 'UPDATE teacher_rank SET total = %s WHERE teacher_id = %s'
    print(dbs.exe_many(u_sql, u_list))
Esempio n. 16
0
def get_school():
    """
    4. School score (``school``).

    Value   Weight
    985     2
    211     1
    other   0

    Every ``teacher_rank.school`` is reset to 0, then teachers whose school
    characteristic mentions "985" get 2 and those mentioning "211" get 1.

    :return: None
    """
    reset_sql = '''
        UPDATE teacher_rank
        SET school = 0
    '''
    print(dbs.exe_sql(reset_sql))

    lookup_sql = '''
        SELECT teacher.id as id, school_info.characteristic as characteristic
        FROM `teacher`, `school_info`
        WHERE teacher.school_id = school_info.id AND teacher.school_id != 0;
    '''
    import re
    rows = dbs.getDics(lookup_sql)
    print(len(rows))

    weighted = []
    for row in rows:
        tags = row['characteristic']
        if tags is None or tags == "":
            continue
        # "985" takes precedence when both labels are present.
        if re.search('985', tags):
            weight = 2
        elif re.search('211', tags):
            weight = 1
        else:
            continue
        weighted.append((weight, row['id']))
    print(len(weighted))

    assign_sql = '''
        UPDATE teacher_rank
        SET school=%s
        WHERE teacher_id=%s
    '''
    print(dbs.exe_many(assign_sql, weighted))
Esempio n. 17
0
File: test.py Progetto: fengges/eds
def teacher_extract(
        teacher_file='C:\\Users\\Administrator\\Desktop\\teacher.txt'):
    """
    Fill ``TEACHERS`` in ``pss_zhuanli_copy`` with the inventors who are teachers.

    ``INVIEW`` is split on ';' and every name found in the teacher list is
    kept, in the original order, joined with ';'.

    :param teacher_file: UTF-8 text file with one teacher name per line;
        blank lines are ignored
    :return: None
    """
    s_sql = "SELECT * FROM `pss_zhuanli_copy`;"
    info_list = dbs.getDics(s_sql)

    with open(teacher_file, 'r', encoding='utf-8') as handle:
        # Blank lines are dropped so that an empty author field (e.g. from a
        # trailing ';') can never count as a teacher.
        teachers = {name for name in handle.read().split('\n') if name}
    print(teachers)
    print("*" * 10)
    update_list = []
    for item in info_list:
        author_list = (item['INVIEW'] or "").split(';')
        teacher_list = [author for author in author_list
                        if author in teachers]
        print(author_list, teacher_list)
        update_list.append((";".join(teacher_list), item['id']))

    u_sql = "UPDATE `pss_zhuanli_copy` SET TEACHERS = %s WHERE id=%s"
    print(len(update_list))
    print(dbs.exe_many(u_sql, update_list))
Esempio n. 18
0
def _age_from_description(age_description, current_year):
    """
    Turn a "<year>-<kind>" description into ``(description, age)``.

    ``kind`` is 出生 (birth), 学士 (bachelor), 硕士 (master) or 博士 (doctor);
    22, 25 and 30 are added for the three degree kinds. Empty, malformed or
    unknown descriptions, and ages outside 20..100, give ``('', 0)``.

    :param age_description: text such as "1975-出生"
    :param current_year: year the age is computed for
    :return: tuple ``(age_description, age)``
    """
    offsets = {'出生': 0, '学士': 22, '硕士': 25, '博士': 30}
    age = 0
    if age_description:
        parts = age_description.split('-')
        try:
            year = int(parts[0])
            year_type = parts[1]
        except (ValueError, IndexError):
            # Not "<year>-<kind>": treated like a missing description.
            pass
        else:
            if year_type in offsets:
                age = current_year - year + offsets[year_type]

    if age > 100 or age < 20:
        return '', 0
    return age_description, age


def get_age(current_year=2018):
    """
    Estimate every teacher's age and store it in ``teacher_age``.

    For homepages on ckspkk.eol.cn the evaluated ``info`` dict is used: the
    '出生年月' (birth date) value when it has more than 3 characters,
    otherwise the '个人简介' (profile) text is passed to the ``Extractor``.
    For all other homepages the joined values of the evaluated ``info`` (or
    the raw text when that fails) are passed to the ``Extractor``.

    :param current_year: year the ages are computed for
    :return: None
    """
    select_sql = "SELECT * from teacherdata_info"
    teacherdata = dbs.getDics(select_sql)
    print(len(teacherdata))

    uplist = []
    extractor = Extractor()
    for teacher in teacherdata:
        if 'http://ckspkk.eol.cn' in (teacher['homepage'] or ''):
            try:
                # NOTE(review): eval() executes whatever is stored in the
                # column; consider ast.literal_eval for plain literals.
                info = eval(teacher['info'])
            except Exception:
                print(teacher['id'])
                continue
            birthday = info.get('出生年月', '')
            person_info = info.get('个人简介', None)
            # Explicit birth date available.
            if len(birthday) > 3:
                uplist.append((birthday[:4] + '-出生', teacher['id']))
                continue
            # No birth date: fall back to the profile text.
            if person_info is None:
                continue
        else:
            try:
                info = eval(teacher['info'])
                person_info = "".join(list(info.values()))
            except Exception:
                # Not a dict literal: use the raw text as it is.
                person_info = teacher['info']

            if person_info is None:
                continue

        extractor.set_text(person_info)
        uplist.append((extractor.get_birthday(), teacher['id']))

    uplistNew = []
    for age_description, teacher_id in uplist:
        age_description, age = _age_from_description(age_description,
                                                     current_year)
        uplistNew.append((age_description, age, teacher_id))

    print(len(uplistNew))
    update_sql = "update teacher_age set age_description=%s,age=%s where id=%s"
    print(dbs.exe_many(update_sql, li=uplistNew))