def deal31():
    """Snapshot four author-related tables filtered to the physics talent list.

    Reads the full tables from the database, keeps only rows whose
    person_id appears in the Excel whitelist, and writes the filtered
    frames back as new *0910 tables. Side effects only; returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)

    sql = 'select * from author_info_new'
    author_info_new0910 = dbutil.get_allresult(sql, 'df')

    sql = 'select * from h_index'
    h_index0910 = dbutil.get_allresult(sql, 'df')

    sql = 'select * from article_cite_data'
    article_cite_data0910 = dbutil.get_allresult(sql, 'df')

    # First-author document counts per author and publish year.
    sql = 'select scopus_id, person_id, publish_year, count(*) as first_au_doc_num from scopus_author_article ' \
          'group by scopus_id, person_id, publish_year'
    scopus_author_article0910 = dbutil.get_allresult(sql, 'df')

    physics_list = pd.read_excel(
        'C:/Users/Administrator/Desktop/物理学人才清单_20200908.xlsx',
        sheet_name='data')
    # person_id columns in the DB tables are strings; align the whitelist dtype.
    physics_list['人才编号'] = physics_list['人才编号'].astype('str')

    # Build the id whitelist once instead of re-converting the column per filter.
    person_ids = set(physics_list['人才编号'])
    author_info_new0910 = author_info_new0910.loc[
        author_info_new0910['person_id'].isin(person_ids)]
    h_index0910 = h_index0910.loc[h_index0910['person_id'].isin(person_ids)]
    article_cite_data0910 = article_cite_data0910.loc[
        article_cite_data0910['person_id'].isin(person_ids)]
    scopus_author_article0910 = scopus_author_article0910.loc[
        scopus_author_article0910['person_id'].isin(person_ids)]

    dbutil.df_insert('author_info_new0910', author_info_new0910)
    dbutil.df_insert('h_index0910', h_index0910)
    dbutil.df_insert('article_cite_data0910', article_cite_data0910)
    dbutil.df_insert('scopus_author_article0910', scopus_author_article0910)
    # Fix: the connection was previously leaked; sibling helpers call close().
    dbutil.close()
def deal6():
    """Build Scopus author-search links for unmatched authors of one batch
    and export them to an Excel workbook for manual checking."""
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id, name from not_matched_author where data_no='2020060819255418'"
    unmatched = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    talent_info = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    # person_id from the DB is a string; align the Excel key dtype.
    talent_info['学者代码'] = talent_info['学者代码'].astype('str')

    merged = pd.merge(unmatched,
                      talent_info,
                      how='left',
                      left_on='person_id',
                      right_on='学者代码')
    merged.sort_values(by=['学者代码'], inplace=True, ignore_index=True)

    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'

    for idx in range(len(merged)):
        # NOTE(review): assumes 'name' is exactly two space-separated tokens.
        name_parts = merged.loc[idx, 'name'].split(' ')
        given = name_parts[0].lower()
        family = name_parts[1].lower()
        affiliation = '+'.join(merged.loc[idx, 'ins_en'].split(' ')).lower()
        merged.loc[idx, '搜索链接'] = search_url.format(given, family,
                                                    affiliation)

    merged.to_excel('C:/Users/Administrator/Desktop/0609not.xlsx',
                    sheet_name='Sheet1',
                    index=False)
def rf_cal():
    """Compute per-subject cited/citing reference flows between journals.

    For every subject's core-journal set, sums the citations that set
    receives from / sends to other journals (self-citations are excluded
    by the SQL), restricts both frames to the not-yet-classified journal
    list, and stores them in the database.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select scopus_journal_id, cite_num+0 as cite_num, cite_journal_id from scopus_cite_data ' \
          'where scopus_journal_id!=cite_journal_id'
    all_cite_data = dbutil.get_allresult(sql, 'df')

    # Core journals per subject, taken from the previously exported sheet.
    already_data = pd.read_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx')
    already_data['scopus_journal_id'] = already_data[
        'scopus_journal_id'].astype('str')
    core_journal = {
        subject: list(grp['scopus_journal_id'])
        for subject, grp in already_data.groupby('subject')
    }

    cited_frames = []
    citing_frames = []
    for subject, journals in core_journal.items():
        print(subject)
        # Citations this subject's core set receives from other journals.
        cited = all_cite_data.loc[
            all_cite_data['cite_journal_id'].isin(journals)].groupby(
                'scopus_journal_id', as_index=False)['cite_num'].sum()
        cited['subject'] = subject
        cited_frames.append(cited)

        # Citations this subject's core set sends to other journals.
        citing = all_cite_data.loc[
            all_cite_data['scopus_journal_id'].isin(journals)].groupby(
                'cite_journal_id', as_index=False)['cite_num'].sum()
        citing['subject'] = subject
        citing_frames.append(citing)

    rf_cited_df = pd.concat(cited_frames)
    rf_citing_df = pd.concat(citing_frames)

    # Keep only journals that still need a subject assignment.
    select_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/未被分学科期刊.xlsx')
    select_journal['Scopus Source ID'] = select_journal[
        'Scopus Source ID'].astype('str')
    wanted = list(select_journal['Scopus Source ID'])
    rf_cited_df = rf_cited_df.loc[
        rf_cited_df['scopus_journal_id'].isin(wanted)]
    rf_citing_df = rf_citing_df.loc[
        rf_citing_df['cite_journal_id'].isin(wanted)]

    dbutil.df_insert('rf_cited_data_last', rf_cited_df)
    dbutil.df_insert('rf_citing_data_last', rf_citing_df)

    dbutil.close()
def deal10():
    """Export rows of the WoS sheet whose person_id has not been processed.

    "Processed" means the id appears in either find_result or not_find.
    Writes the remainder to data_left.xlsx. Side effects only.
    """
    dbutil = DBUtil(host, port, database, username, password)
    # person_id+0 casts the string id to a number so it matches the Excel dtype.
    sql = "select person_id+0 as person_id from (select DISTINCT person_id from find_result UNION select " \
          "person_id from not_find) a ORDER BY person_id"
    df = dbutil.get_allresult(sql, 'df')
    # Fix: close the connection (it was previously leaked).
    dbutil.close()

    df2 = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    df3 = df2.loc[~df2['person_id'].isin(list(df['person_id']))]
    df3.to_excel('C:/Users/Administrator/Desktop/data_left.xlsx',
                 sheet_name='Sheet1',
                 index=False)
def deal8():
    """Export the people that produced no search results to an Excel report."""
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select distinct (person_id+0) as person_id from not_find where person_id not in (select person_id from find_result)"
    missing = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    wos_sheet = pd.read_excel(
        'C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    report = pd.merge(missing, wos_sheet, how='left', on='person_id')
    report.to_excel('C:/Users/Administrator/Desktop/未搜索到结果清单.xlsx',
                    sheet_name='Sheet1',
                    index=False)
def data_prepare():
    """Export per-journal citation totals with self-citations removed.

    Produces two Excel sheets: citations received minus self-citations,
    and citations given minus self-citations. Side effects only.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select scopus_journal_id, sum(cite_num+0) as cite_num_all from scopus_cite_data group by scopus_journal_id'
    df1 = dbutil.get_allresult(sql, 'df')

    # Self-citations: rows where a journal cites itself.
    sql = 'SELECT scopus_journal_id,cite_num+0 as cite_num_self from scopus_cite_data ' \
          'where scopus_journal_id=cite_journal_id'
    df2 = dbutil.get_allresult(sql, 'df')

    sql = 'select cite_journal_id as scopus_journal_id, sum(cite_num+0) as cite_num_all ' \
          'from scopus_cite_data group by cite_journal_id'
    df3 = dbutil.get_allresult(sql, 'df')
    # Fix: close the connection once all queries are done (was leaked).
    dbutil.close()

    df1 = pd.merge(df1, df2, how='outer', on='scopus_journal_id')
    df1.fillna(0, inplace=True)
    df1['cite_num_noself'] = df1['cite_num_all'] - df1['cite_num_self']
    df1.to_excel('C:/Users/Administrator/Desktop/去除自引被引数据.xlsx', index=False)

    df3 = pd.merge(df3, df2, how='outer', on='scopus_journal_id')
    df3.fillna(0, inplace=True)
    df3['cite_num_noself'] = df3['cite_num_all'] - df3['cite_num_self']
    df3.to_excel('C:/Users/Administrator/Desktop/去除自引引用数据.xlsx', index=False)
def temp(input_data):
    """Return the items of *input_data* whose person_id is not yet present
    in any of the matched/multi-matched/unmatched author tables."""
    from src.config.DBUtil import DBUtil
    from src.Scopus_Crawler.scopus_config import host, port, database, username, password

    db = DBUtil(host, port, database, username, password)
    sql = "SELECT person_id FROM author_info_new UNION SELECT person_id FROM mult_matched_author UNION SELECT " \
          "person_id FROM not_matched_author"
    known_df = db.get_allresult(sql, 'df')
    db.close()

    # Set membership gives O(1) lookups while filtering.
    known_ids = set(known_df['person_id'])
    return [item for item in input_data
            if str(item['person_id']) not in known_ids]
def deal9():
    """Split find_result's 'ins' field into institution name and count,
    join with the WoS sheet, and export the combined data."""
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id+0 as person_id, find_num, ins from find_result"
    find_result = dbutil.get_allresult(sql, 'df')
    # Fix: release the connection right after the query (was leaked).
    dbutil.close()

    find_result['s_num'] = '0'
    for i in range(len(find_result)):
        # 'ins' looks like 'Institution Name (123)': strip the trailing ')'
        # and split off the count after ' ('.
        temp_list = find_result.loc[i, 'ins'][:-1].split(' (')
        find_result.loc[i, 'ins'] = temp_list[0]
        find_result.loc[i, 's_num'] = temp_list[1]

    df = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    result = pd.merge(find_result, df, on='person_id', how='left')
    result.set_index(['person_id', 'JCR期刊列表', '姓名', 'find_num', 'ins_x'],
                     inplace=True)
    result.to_excel('C:/Users/Administrator/Desktop/有搜索结果的数据.xlsx',
                    sheet_name='Sheet1')
def deal1():
    """Build a manual-check workbook for one batch of matched authors.

    Joins author_info_new rows (selected by data_no) with the talent
    Excel sheet and adds three helper URL columns: the Scopus author
    detail page, a Scopus author-name search, and a Baidu search.
    Side effects only; returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id, name, scopus_id from author_info_new where data_no='2020060819255418'"
    author_list1 = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    author_list2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx',
        sheet_name='Sheet1')
    # person_id from the DB is a string; align the Excel key dtype.
    author_list2['学者代码'] = author_list2['学者代码'].astype('str')

    author_list = pd.merge(author_list1,
                           author_list2,
                           how='left',
                           left_on='person_id',
                           right_on='学者代码')
    author_list.drop_duplicates(subset=['person_id'],
                                inplace=True,
                                keep='first')
    author_list.reset_index(drop=True, inplace=True)

    # Placeholder values so the URL columns exist before the per-row loop.
    author_list['详细链接'] = 'a'
    author_list['百度搜索链接'] = 'b'
    author_list['搜索链接'] = 'c'

    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'

    for i in range(len(author_list)):
        author_list.loc[
            i,
            '详细链接'] = 'https://www.scopus.com/authid/detail.uri?authorId=%s' % author_list.loc[
                i, 'scopus_id']
        # NOTE(review): assumes 'name' has exactly two space-separated
        # tokens; a middle name would shift the second token.
        first_name = author_list.loc[i, 'name'].split(' ')[0].lower()
        last_name = author_list.loc[i, 'name'].split(' ')[1].lower()
        ins_name = '+'.join(author_list.loc[i, 'ins_en'].split(' ')).lower()
        author_list.loc[i, '搜索链接'] = search_url.format(first_name, last_name,
                                                       ins_name)
        author_list.loc[i, '百度搜索链接'] = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=%s' \
                                       % author_list.loc[i, '头衔当选单位'] + author_list.loc[i, '姓名']

    author_list.to_excel('C:/Users/Administrator/Desktop/check_data.xlsx',
                         index=False,
                         encoding='utf-8')
def deal30():
    """Export physics-list people absent from author_info_new, with Baidu
    search links attached for manual lookup."""
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select distinct person_id from author_info_new'
    df = dbutil.get_allresult(sql, 'df')
    # Fix: close the connection (it was previously leaked).
    dbutil.close()

    input_df = pd.read_excel(
        'C:/Users/Administrator/Desktop/物理学人才清单_20200908.xlsx',
        sheet_name='Sheet2')
    input_df['人才编号'] = input_df['人才编号'].astype('str')
    # Keep only people whose data has not been collected yet.
    input_df = input_df.loc[~input_df['人才编号'].isin(list(df['person_id']))]

    input_df[
        '百度搜索'] = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=' + input_df[
            '当选单位信息'] + input_df['姓名']
    input_df['百度学术搜索'] = 'https://xueshu.baidu.com/usercenter/data/authorchannel?cmd=inject_page&author=' + \
                               input_df['姓名'] + '&affiliate=' + input_df['当选单位信息']
    input_df.to_excel('C:/Users/Administrator/Desktop/人工查找0909.xlsx',
                      index=False)
def deal21():
    """Aggregate WoS document counts by organisation, year and CSSC id.

    Maps category_id to CSSC ids via the Excel mapping sheet, sums
    doc_num over (orgName, year, id, Description) and stores the result
    in the doc_data table.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select * from wos_doc_data'
    df = dbutil.get_allresult(sql, 'df')
    df['doc_num'] = df['doc_num'].astype('int')

    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/cssc-category-mapping.xlsx',
        sheet_name='Sheet1')
    # The ids are 4-digit codes whose leading zeros Excel drops.
    # Fix: the old loop prepended a single '0' (`if len < 4`), so ids
    # shorter than 3 digits stayed under-padded; zfill pads to full width.
    df2['id'] = df2['id'].astype('str').str.zfill(4)

    df = pd.merge(df, df2, on='category_id')
    result = df.groupby(by=['orgName', 'year', 'id', 'Description'],
                        as_index=False)['doc_num'].sum()
    dbutil.df_insert('doc_data', result)
    # Fix: close the connection (it was previously leaked).
    dbutil.close()
def deal5():
    """Export the distinct multi-matched authors of one batch joined with
    the talent Excel sheet."""
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id from mult_matched_author where data_no='2020052716115197'"
    mult_matched = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    talent_info = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    # person_id from the DB is a string; align the Excel key dtype.
    talent_info['学者代码'] = talent_info['学者代码'].astype('str')

    merged = pd.merge(mult_matched,
                      talent_info,
                      how='left',
                      left_on='person_id',
                      right_on='学者代码')
    merged.drop_duplicates(subset=['person_id'],
                           inplace=True,
                           ignore_index=True)

    merged.to_excel('C:/Users/Administrator/Desktop/0605.xlsx',
                    sheet_name='Sheet1',
                    index=False)
def last_process():
    """Assemble the final per-subject reference-flow (rf) workbook.

    Joins the *_last cited/citing flows with whole-journal totals,
    normalises them into per-subject percentages, attaches journal
    metadata (core-journal counts, article output, Chinese names) and
    writes both the full table and a filtered per-subject result to
    Excel. Side effects only; returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)

    # sql = 'select scopus_journal_id, cite_num as rf_cited_num, subject from rf_cited_data'
    # rf_cited_data = dbutil.get_allresult(sql, 'df')
    #
    # sql = 'select cite_journal_id as scopus_journal_id, cite_num as rf_citing_num, subject from rf_citing_data'
    # rf_citing_data = dbutil.get_allresult(sql, 'df')

    sql = 'select scopus_journal_id, cite_num as rf_cited_num, subject from rf_cited_data_last'
    rf_cited_data = dbutil.get_allresult(sql, 'df')

    sql = 'select cite_journal_id as scopus_journal_id, cite_num as rf_citing_num, subject from rf_citing_data_last'
    rf_citing_data = dbutil.get_allresult(sql, 'df')

    # Whole-journal citation totals, used as normalisation denominators.
    sql = 'select * from cited_data'
    sum_cited_data = dbutil.get_allresult(sql, 'df')

    sql = 'select * from citing_data'
    sum_citing_data = dbutil.get_allresult(sql, 'df')
    # Fix: close the connection after the last query (it was never closed).
    dbutil.close()

    # Per-subject share of each journal's incoming citations, then that
    # share as a fraction of the journal's sum over all subjects.
    rf_cited_data = pd.merge(rf_cited_data,
                             sum_cited_data,
                             how='left',
                             on='scopus_journal_id')
    rf_cited_data['rf_cited_value'] = rf_cited_data[
        'rf_cited_num'] / rf_cited_data['cited_num']
    rf_cited_sum = rf_cited_data.groupby(
        'scopus_journal_id', as_index=False)['rf_cited_value'].sum()
    rf_cited_sum.rename(columns={'rf_cited_value': 'rf_cited_sum'},
                        inplace=True)
    rf_cited_data = pd.merge(rf_cited_data,
                             rf_cited_sum,
                             on='scopus_journal_id')
    rf_cited_data['rf_cited_percent'] = rf_cited_data[
        'rf_cited_value'] / rf_cited_data['rf_cited_sum']

    # Same normalisation for outgoing citations.
    rf_citing_data = pd.merge(rf_citing_data,
                              sum_citing_data,
                              how='left',
                              on='scopus_journal_id')
    rf_citing_data['rf_citing_value'] = rf_citing_data[
        'rf_citing_num'] / rf_citing_data['citing_num']
    rf_citing_sum = rf_citing_data.groupby(
        'scopus_journal_id', as_index=False)['rf_citing_value'].sum()
    rf_citing_sum.rename(columns={'rf_citing_value': 'rf_citing_sum'},
                         inplace=True)
    rf_citing_data = pd.merge(rf_citing_data,
                              rf_citing_sum,
                              on='scopus_journal_id')
    rf_citing_data['rf_citing_percent'] = rf_citing_data[
        'rf_citing_value'] / rf_citing_data['rf_citing_sum']

    rf_data = pd.merge(rf_cited_data,
                       rf_citing_data,
                       how='outer',
                       on=['scopus_journal_id', 'subject'])

    # Re-attach the raw totals (the merged copies may hold NaN after the
    # outer join above).
    rf_data.drop(columns=['cited_num', 'citing_num'], inplace=True)
    rf_data = pd.merge(rf_data,
                       sum_cited_data,
                       how='left',
                       on='scopus_journal_id')
    rf_data = pd.merge(rf_data,
                       sum_citing_data,
                       how='left',
                       on='scopus_journal_id')
    rf_data.fillna(0, inplace=True)
    rf_data['scopus_journal_id'] = rf_data['scopus_journal_id'].astype('int64')

    journal_name_data = pd.read_excel(
        'C:/Users/Administrator/Desktop/Journal Citation Score 2019带一级学科信息20200726.xlsx'
    )
    journal_name_data.drop_duplicates(subset=['Scopus Source ID'],
                                      inplace=True,
                                      ignore_index=True)

    with open('core_journal.json', 'r', encoding='utf-8') as cj:
        core_journal = json.load(cj)

    # Total scholarly output of each subject's core-journal set.
    journal_name_data_temp = journal_name_data.loc[:, [
        'Scopus Source ID', 'Scholarly Output'
    ]]
    journal_name_data_temp['Scopus Source ID'] = journal_name_data_temp[
        'Scopus Source ID'].astype('str')
    core_journal_article_num = {}
    for subject, value in core_journal.items():
        core_journal_article_num[subject] = sum(
            journal_name_data_temp[journal_name_data_temp['Scopus Source ID'].
                                   isin(value)]['Scholarly Output'])

    rf_data['cited_citing_sum'] = rf_data['citing_num'] + rf_data['cited_num']
    rf_data['cited_citing_percent_sum'] = rf_data[
        'rf_cited_percent'] + rf_data['rf_citing_percent']
    # Attach the Chinese journal names.
    journal_name_zh = pd.read_excel(
        'C:/Users/Administrator/Desktop/期刊翻译结果.xlsx')
    rf_data = pd.merge(rf_data,
                       journal_name_zh,
                       how='left',
                       on='scopus_journal_id')

    # rf_data.to_excel('C:/Users/Administrator/Desktop/rf_data.xlsx', index=False)

    rf_data.to_excel('C:/Users/Administrator/Desktop/rf_data_last.xlsx',
                     index=False)

    rf_data_result = rf_data

    # Size of each subject's core-journal set, mapped onto every row.
    core_journal_num_dict = {}
    for subject, value in core_journal.items():
        core_journal_num_dict[subject] = len(value)

    rf_data_result['core_journal数量'] = rf_data_result['subject']
    rf_data_result['core_journal数量'].replace(core_journal_num_dict,
                                             inplace=True)
    rf_data_result['core_journal数量'] = rf_data_result['core_journal数量'].astype(
        'int64')

    rf_data_result['core_journal发文数量'] = rf_data_result['subject']
    rf_data_result['core_journal发文数量'].replace(core_journal_article_num,
                                               inplace=True)
    rf_data_result['core_journal发文数量'] = rf_data_result[
        'core_journal发文数量'].astype('int64')

    rf_data_result1 = copy.deepcopy(rf_data_result)
    rf_data_result2 = copy.deepcopy(rf_data_result)

    # Per-subject results (raw): keep a row when its combined percentage
    # is high enough, or when the journal's designated core subject
    # matches the row's subject.
    select_core_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/core_journal-new.xlsx')
    select_core_journal = select_core_journal.loc[
        select_core_journal['Core journal'].notnull(),
        ['scopus_journal_id', 'Core journal']]
    rf_data_result2 = pd.merge(rf_data_result2,
                               select_core_journal,
                               how='left',
                               on='scopus_journal_id')

    rf_data_result2 = rf_data_result2.loc[
        (rf_data_result2['cited_citing_percent_sum'] >= 0.4) |
        (rf_data_result2['subject'] == rf_data_result2['Core journal'])]

    # rf_data_result2.to_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx', index=False)

    rf_data_result2.to_excel('C:/Users/Administrator/Desktop/分学科结果last.xlsx',
                             index=False)
Example #14
0
                element = h_index.find(class_='fontLarge')
                if element:
                    result_list.append(
                        [input_data[i][0], input_data[i][1], element.text])

            # 结束循环
            count = len(input_data)

        # 出现错误时,从错误处中断,再从该处开始
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        # 将已完成的部分进行数据写入
        result_df = pd.DataFrame(data=result_list,
                                 columns=['person_id', 'scopus_id', 'h_index'])
        write2sql([['h_index', result_df]])


if __name__ == '__main__':
    # For unit testing: crawl the h-index for every author that does not
    # yet have an h_index row.
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select distinct person_id, scopus_id from author_info_new where ' \
          'person_id not in (select person_id from h_index)'
    author_id_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    # crawl_h_index is defined earlier in the original module (not shown here).
    input_data = author_id_df.values.tolist()
    crawl_h_index(input_data)
    print('****** END ******')
# dbutil = DBUtil(host, port, database, username, password)
# sql = 'select * from wos_doc_data_copy'
# data = dbutil.get_allresult(sql, 'df')
# print('读取源数据完成')
# df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
# data = pd.merge(data, df, how='left', on='orgName')
#
# print('merge完成')
# print('开始写入数据库')
# write2sql([['wos_doc_data0930', data]])
# print('写入数据库完成')

# Read the detailed WoS document data and attach canonical school names
# from the mapping sheet.
# NOTE(review): this connection is never closed in the visible code.
dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_doc_data_detail'
data = dbutil.get_allresult(sql, 'df')
print('读取源数据完成')
df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
data = pd.merge(data, df, how='left', on='orgName')

# Split the merged frame into 500k-row chunks — presumably written to the
# DB batch by batch further below (the consuming loop is not shown here).
data1 = data.loc[0:499999]
data2 = data.loc[500000:999999]
data3 = data.loc[1000000:1499999]
data4 = data.loc[1500000:1999999]
data5 = data.loc[2000000:2499999]
data6 = data.loc[2500000:2999999]
data7 = data.loc[3000000:]

data_list = [data1, data2, data3, data4, data5, data6, data7]

i = 0
                    })
                cite_df_temp['cite_num'] = 1
                cite_journal_data = cite_df_temp.groupby(
                    by=['cite_journal',
                        'cite_journal_id'], as_index=False).sum()
                cite_journal_data['scopus_journal_id'] = input_data[i][0]
                result_df_list.append(cite_journal_data)

            count = len(input_data)

        # 出现错误时,从错误处中断,再从该处开始
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        if result_df_list:
            all_data = pd.concat(result_df_list)
            write2sql([['scopus_cite_data0810', all_data]])


if __name__ == '__main__':
    # Crawl citation data for journals not yet present in the 0810 table.
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select scopus_journal_id from scopus_journal_id where scopus_journal_id not ' \
          'in (select distinct scopus_journal_id from scopus_cite_data0810) and cite_num!="0"'
    journal_id_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    # get_cite_data is defined earlier in the original module (not shown here).
    input_data = journal_id_df.values.tolist()
    # input_data = input_data[:5]
    get_cite_data(input_data)
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Deduplicate wos_article_data on the UT column and store the result as
wos_article_data_final."""

import pandas as pd

from src.config.DBUtil import DBUtil
from src.Scopus_Crawler.scopus_config import host, port, database, username, password

dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_article_data'
already_df = dbutil.get_allresult(sql, 'df')

# Drop records sharing the same UT value, keeping the first occurrence.
already_df.drop_duplicates(subset=['UT'], ignore_index=True, inplace=True)

dbutil.df_insert('wos_article_data_final', already_df)
dbutil.close()
                    data_df['orgName'] = input_data[i][0]
                    result_df_list.append(data_df)

            count = len(input_data)
            session.close()

        # 出现错误时,从错误处中断,再从该处开始
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i

        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            write2sql([['wos_doc_data_detail', all_data_df]])


if __name__ == '__main__':
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select orgName, `key`, year, category_id, category_name from wos_doc_data_copy'
    input_df = dbutil.get_allresult(sql, 'df')
    # Dedupe on everything except year, so each org/key/category triple
    # is crawled once.  NOTE(review): 'year' is selected but excluded
    # from the dedupe subset — confirm that is intended.
    input_df.drop_duplicates(
        subset=['orgName', 'key', 'category_id', 'category_name'],
        inplace=True)
    dbutil.close()

    input_data = input_df.values.tolist()

    # get_doc_data is defined earlier in the original module (not shown
    # here).  Process in batches of 5000; assumes at most 45000 rows.
    for rdd in range(9):
        get_doc_data(input_data[5000 * rdd:5000 * rdd + 5000])