def RK_LASD(x, pt):
    """Scrape the four sub-pages for year suffix ``x`` into one workbook.

    Args:
        x: Year suffix; appended to ``'Results/RK2_201'`` to build the output
            file name and to the module-level ``URL`` to build page addresses.
        pt: Indexable holding one regex pattern string per sub-page (0-3).

    Every sub-page is fetched with ``mr.getOnePage``, matched against its
    pattern and written row by row starting at sheet row 3.  Sub-page 0 starts
    at column 0; each later sub-page ``i`` is shifted right by ``RC2[i] + 1``.
    """
    out_path = 'Results/RK2_201' + str(x) + '.xls'
    print('Writing to ' + out_path + ' ... ...')
    initExcel_rk2(out_path)
    url_prefix = URL + str(x) + '_'

    for page in range(4):
        page_url = url_prefix + str(page) + '.html'
        html = mr.getOnePage(page_url, 'utf-8')
        records = re.findall(re.compile(pt[page], re.S), html)

        # The number of columns is taken from the first record only.
        width = len(records[0])
        # Only pages after the first consult RC2 for their column shift.
        col_shift = 0 if page == 0 else RC2[page] + 1

        for row_no, fields in enumerate(records, start=3):
            for col in range(width):
                mr.writeToExcel(out_path, row_no, col + col_shift, fields[col])
            # Progress output is 1-based, two less than the sheet row.
            print(row_no - 2)
def QS0():
    """Download the QS Asia ranking file for each of the years 2010-2020.

    For every year the results page is fetched with ``mr.getOnePage``; the
    first match of the module-level ``PATTERN`` is treated as a site-relative
    link, which is saved through ``mr.saveOneFile`` as ``QS20<yy>.csv``.

    Years whose page yields no match are reported and skipped, so one missing
    link no longer aborts the remaining downloads.
    """
    pattern = re.compile(PATTERN, re.S)
    for i in range(11):
        # Two-digit year suffix ('10' .. '20'); it was referenced below as
        # `s` without ever being assigned inside this function.
        s = str(10 + i)
        # The query string previously read 'QS®ion=Asia': apparently
        # '&region' with its '&reg' prefix decoded as an HTML entity.
        url = ('https://www.universityrankings.ch/results?ranking=QS&region=Asia&year=20'
               + s + '&q=China+')
        html = mr.getOnePage(url, 'utf-8')
        result = re.findall(pattern, html)
        if not result:
            print('No download link found for 20' + s + ', skipping')
            continue
        url = 'https://www.universityrankings.ch' + result[0]
        file_name = 'QS20' + s + '.csv'
        mr.saveOneFile(file_name, url)
def XYH2017():
    """Scrape the table at dxsbb.com/news/1383.html into an .xls file.

    The page is fetched as GBK via ``mr.getOnePage`` and every matching table
    row is written to ``Results/XYH2017-2018.xls`` from row 1, five columns
    per row:

    * column 1 keeps only the CJK characters of the cell (joined together);
    * columns 0 and 4 keep the first number found in the cell;
    * columns 2 and 3 are written unchanged.
    """
    name = 'Results/XYH2017-2018.xls'
    print('Writing to ' + name + ' ... ...')
    mr.initExcel(ROW2017, name)
    url = 'https://www.dxsbb.com/news/1383.html'
    html = mr.getOnePage(url, 'gbk')
    pattern = re.compile(
        '<tr height="19"><td x:num="(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td x:num="(.*?)</td></tr>',
        re.S)
    result = re.findall(pattern, html)

    pattern1 = re.compile("[\u4e00-\u9fa5]+", re.S)
    # Raw string: '\d' and '\.' are invalid escapes in a normal literal and
    # trigger a SyntaxWarning on recent Python versions.
    pattern2 = re.compile(r"\d+\.?\d*", re.S)

    for t, item in enumerate(result, start=1):
        for i in range(5):
            if i == 1:
                r = re.findall(pattern1, item[i])
                mr.writeToExcel(name, t, i, "".join(r))
            elif i == 0 or i == 4:
                # These cells still carry the tail of the x:num attribute;
                # keep only the first numeric token.
                r = re.findall(pattern2, item[i])
                mr.writeToExcel(name, t, i, r[0])
            else:
                mr.writeToExcel(name, t, i, item[i])
        print(t)
def Times(x):
    """Scrape two pages for year suffix ``x`` into one workbook.

    Args:
        x: Year suffix; used in the output file name and between ``URL1`` and
            the page-specific URL suffix.

    The first page (``URL2`` / ``PATTERN1``) fills columns starting at 0, the
    second page (``URL3`` / ``PATTERN2``) fills columns starting at 6.  Both
    start at sheet row 1.
    """
    out_path = 'Results/Times201' + str(x) + '.xls'
    print('Writing to ' + out_path + ' ... ...')
    mr.initExcel(ROW, out_path)

    # --- first page: columns 0 .. width-1 ---
    html = mr.getOnePage(URL1 + str(x) + URL2, 'utf-8')
    records = re.findall(re.compile(PATTERN1, re.S), html)
    width = len(records[0])
    for row_no, fields in enumerate(records, start=1):
        for col in range(width):
            mr.writeToExcel(out_path, row_no, col, fields[col])
        print(row_no)

    # --- second page: same rows, shifted six columns to the right ---
    html = mr.getOnePage(URL1 + str(x) + URL3, 'utf-8')
    records = re.findall(re.compile(PATTERN2, re.S), html)
    width = len(records[0])
    for row_no, fields in enumerate(records, start=1):
        for col in range(width):
            mr.writeToExcel(out_path, row_no, col + 6, fields[col])
        print(row_no)
def RK_ZHDX(x, row, pt):
    """Scrape one page for year suffix ``x`` into ``Results/RK201<x>.xls``.

    Args:
        x: Year suffix used in both the output file name and the page URL.
        row: Passed through to ``mr.initExcel`` together with the file name.
        pt: Regex pattern string applied to the page with ``re.S``.

    Each match becomes one sheet row (starting at row 1); the number of
    columns written is the length of the first match.
    """
    out_path = 'Results/RK201' + str(x) + '.xls'
    print('Writing to ' + out_path + ' ... ...')
    mr.initExcel(row, out_path)

    html = mr.getOnePage(URL + str(x) + '.html', 'utf-8')
    records = re.findall(re.compile(pt, re.S), html)

    width = len(records[0])
    for row_no, fields in enumerate(records, start=1):
        for col in range(width):
            mr.writeToExcel(out_path, row_no, col, fields[col])
        print(row_no)
def Scholars():
    """Count scholar titles on the NENU staff page and write three totals.

    The page is rendered in headless Firefox, the text of element
    ``wp_content_w8_0`` is read, and the text is split by heading pairs:

    * ``count[0]``: ``mr.parse2`` between '院士' and '荣誉教授';
    * ``count[1]``: sum of ``parse3`` over three consecutive heading pairs;
    * ``count[2]``: sum of ``parse3`` over four heading pairs.

    The totals are written with ``writeToExcel`` under indices 4, 5 and 7.
    The browser is now shut down even when loading or lookup fails.
    """
    global driver
    count = [0, 0, 0]
    url = 'http://www.nenu.edu.cn/576/list.htm'

    options = webdriver.FirefoxOptions()
    options.set_headless()
    # `driver` is a module global; getEleById presumably reads it
    # (not visible here - confirm).
    driver = webdriver.Firefox(firefox_options=options)
    try:
        driver.get(url)
        elements = getEleById('wp_content_w8_0').text
    finally:
        # quit() ends the whole WebDriver session, whereas close() only
        # closes the current window; run it on every exit path.
        driver.quit()
        del driver

    count[0] = mr.parse2(elements, '院士', '荣誉教授')

    count[1] += parse3(elements, '教育部“长江学者奖励计划”特聘教授', '教育部“长江学者奖励计划”青年学者')
    count[1] += parse3(elements, '教育部“长江学者奖励计划”青年学者', '教育部“长江学者和创新团队发展计划”带头人')
    count[1] += parse3(elements, '教育部“长江学者和创新团队发展计划”带头人', '中国科学院“百人计划”')

    count[2] += parse3(elements, '国家“万人计划”哲学社会科学领军人才', '国家“万人计划”科技创新领军人才')
    count[2] += parse3(elements, '国家“万人计划”科技创新领军人才', '国家“万人计划”教学名师')
    count[2] += parse3(elements, '国家“万人计划”教学名师', '国家杰出青年科学基金获得者')
    count[2] += parse3(elements, '国家“万人计划”青年拔尖人才', '国家优秀青年科学基金获得者')

    print(count)
    writeToExcel(4, count[0])
    writeToExcel(5, count[1])
    writeToExcel(7, count[2])
def YangtzeRiverScholars():
    """Print every entry parsed from the PKU page and bump ``count_3``.

    The page is fetched with ``mr.getOnePage`` and handed to
    ``parseOnePage_YangtzeRiverScholars``; the module-level counter
    ``count_3`` grows by one for each item yielded.
    """
    global count_3
    page = mr.getOnePage('http://www.math.lb.pku.edu.cn/jsdw/rcjh/index.htm')
    for entry in parseOnePage_YangtzeRiverScholars(page):
        print(entry)
        count_3 = count_3 + 1
def Academician(offset):
    """Print every entry parsed from one PKU index page and bump ``count_2``.

    Args:
        offset: String inserted between ``'index'`` and ``'.htm'`` in the
            page URL (it is concatenated directly, so it must be a ``str``).

    The module-level counter ``count_2`` grows by one for each item yielded by
    ``parseOnePage_Academician``.
    """
    global count_2
    page_url = 'http://www.math.lb.pku.edu.cn/jsdw/zgkxyys/index' + offset + '.htm'
    page = mr.getOnePage(page_url)
    for entry in parseOnePage_Academician(page):
        print(entry)
        count_2 = count_2 + 1
def Library():
    """Read the print-collection size from the CSU library page.

    The figure captured between '纸质文献总量' and '万余册' is multiplied by
    10000, truncated to an int, printed, and written with
    ``writeToExcel(37, ...)``.
    """
    page = mr.getOnePage('http://lib.csu.edu.cn/bgjs.jhtml')
    matches = re.compile('纸质文献总量(.*?)万余册', re.S).findall(page)
    # The captured figure is multiplied by 10000 to get a plain count.
    volumes = int(float(matches[0]) * 10000)
    print('中外文藏书合计: ' + str(volumes))
    writeToExcel(37, volumes)
def Library():
    """Read the print-collection size from the SDU library page.

    The figure captured between '馆藏纸质文献' and '万余册' is multiplied by
    10000, truncated to an int, printed, and written with
    ``writeToExcel(37, ...)``.
    """
    page = mr.getOnePage('http://www.lib.sdu.edu.cn/page/about.html')
    matches = re.compile('馆藏纸质文献(.*?)万余册', re.S).findall(page)
    # The captured figure is multiplied by 10000 to get a plain count.
    volumes = int(float(matches[0]) * 10000)
    print('中外文藏书合计: ' + str(volumes))
    writeToExcel(37, volumes)
def RecruitmentProgram():
    """Print every entry parsed from the PKU page and bump ``count_4``.

    The page is fetched with ``mr.getOnePage`` and handed to
    ``parseOnePage_RecruitmentProgram``; the module-level counter ``count_4``
    grows by one for each item yielded.
    """
    global count_4
    page = mr.getOnePage('http://www.math.lb.pku.edu.cn/jsdw/rcjh/index.htm')
    for entry in parseOnePage_RecruitmentProgram(page):
        print(entry)
        count_4 = count_4 + 1
def Scholars():
    """Count occurrences of '中国科学院院士' on the JLU staff page.

    The number of occurrences is printed and written with
    ``writeToExcel(4, ...)``.
    """
    page = mr.getOnePage('http://math.jlu.edu.cn/szdw/spys.htm')
    hits = re.compile('中国科学院院士', re.S).findall(page)
    total = len(hits)
    print('院士数量: ' + str(total))
    writeToExcel(4, total)
def Professor():
    """Sum per-department staff counts and write the three totals.

    ``parseOnePage_Professor`` is called for each of four department slugs;
    its result is printed and accumulated with ``mr.listAdd``.  The final
    three totals are printed and written with ``writeToExcel`` under indices
    1, 2 and 3.
    """
    totals = [0, 0, 0]
    for slug in ('sxyyysxx', 'xxyjskxx', 'glytjxx', 'gdsxjxyyjzx'):
        per_department = parseOnePage_Professor(slug)
        print(per_department)
        totals = mr.listAdd(totals, per_department)

    print(totals)
    for index, value in enumerate(totals, start=1):
        writeToExcel(index, value)
def Scholars():
    """Count the names listed after '长江学者特聘教授:' on the BNU page.

    The text between that heading and the next ``<br />`` is scanned for
    three-character groups (CJK, CJK-or-whitespace, CJK); the number of
    groups is printed and written with ``writeToExcel(5, ...)``.
    """
    page = mr.getOnePage('http://math.bnu.edu.cn/jzg/rcjh/index.htm')

    section_re = re.compile('长江学者特聘教授:(.*?)<br />', re.S)
    name_list = section_re.findall(page)[0]

    # A name is matched as three characters whose middle one may be
    # whitespace.
    name_re = re.compile('([\u4E00-\u9FA5][\u4e00-\u9fa5\\s][\u4e00-\u9fa5])', re.S)
    total = len(name_re.findall(name_list))

    print('长江学者人数: ' + str(total))
    writeToExcel(5, total)
def main():
    """Call ``mr.initExcel()`` and then each module's entry point in order.

    The calls run strictly one after another; an exception in any of them
    stops the remaining ones.
    """
    mr.initExcel()
    papers.papers()
    Peking_University.PKU()
    Sichuan_University.SCU()
    Fudan_University.FU()
    Central_South_University.CSU()
    # NOTE(review): a hyphen is not valid in an identifier, so this line is
    # parsed as `Sun_Yat - sen_University.SYSU()` (a subtraction) and will
    # raise NameError unless both names happen to exist. The intended module
    # name is not visible here - confirm and fix the import/name.
    Sun_Yat-sen_University.SYSU()
    Shandong_University.SDU()
    University_of_Science_and_Technology_of_China.USTC()
    Shanghai_Jiao_Tong_University.SJTU()
    Northest_Normal_University.NENU()
    Jilin_University.JLU()
    Nankai_University.NKU()
    Capital_Normal_University.CNU()
    Beijing_Normal_University.BNU()
    Tsinghua_University.THU()
    Awards.AWD()
    Others.OTH()
def Library():
    """Append two library figures from the PKU page to ``count_8``.

    Three values are captured from 16px ``<span>`` elements following the
    labels '现有纸版文献', '外文期刊' and '中文期刊'.  The first is appended
    as-is; the second and third are joined with ``+`` and appended as one
    entry.
    """
    global count_8
    page = mr.getOnePage('http://www.math.lb.pku.edu.cn/kxyj/ytsg/index.htm')

    # Each figure sits in the next 16px span after its label.
    cell = '.*?"font-size:16px">(.*?)</span>'
    pattern = re.compile(
        '现有纸版文献' + cell + '.*?外文期刊' + cell + '.*?中文期刊' + cell,
        re.S)
    paper, foreign_journals, chinese_journals = pattern.findall(page)[0]

    count_8.append(paper)
    # NOTE(review): both operands are captured strings, so `+` concatenates
    # them rather than adding numbers - confirm this is intended.
    count_8.append(foreign_journals + chinese_journals)
def parseOnePage_Professor(dep):
    """Return three staff counts for one CSU department.

    Args:
        dep: Department slug inserted into the page URL.

    Returns:
        A list of three ints, one per page slug ('js', 'fjs', 'js1'): half
        the number of 'font-size:9pt' occurrences on that page, rounded
        down.
    """
    marker = re.compile('font-size:9pt', re.S)
    base_url = 'http://math.csu.edu.cn/szdw/' + dep + '/'

    counts = [0, 0, 0]
    for slot, page_slug in enumerate(('js', 'fjs', 'js1')):
        page = mr.getOnePage(base_url + page_slug + '.htm')
        # The count is halved; presumably the marker appears twice per
        # person - confirm against the page markup.
        counts[slot] = len(marker.findall(page)) // 2
    return counts
def Library():
    """Read holdings and e-journal figures from the USTC library page.

    Two figures are captured, each multiplied by 10000 and truncated to an
    int: the first is written with ``writeToExcel(37, ...)``, the second
    with ``writeToExcel(40, ...)``.
    """
    page = mr.getOnePage(
        'http://lib.ustc.edu.cn/%e6%9c%ac%e9%a6%86%e6%a6%82%e5%86%b5/%e6%9c%ac%e9%a6%86%e7%ae%80%e4%bb%8b/')
    matcher = re.compile('实体馆藏中外文书刊(.*?)万册.*?中外文电子期刊近(.*?)万种', re.S)
    first_hit = matcher.findall(page)[0]

    volumes = int(float(first_hit[0]) * 10000)
    print('中外文藏书合计: ' + str(volumes))
    writeToExcel(37, volumes)

    journals = int(float(first_hit[1]) * 10000)
    print('中外文期刊种类: ' + str(journals))
    writeToExcel(40, journals)
def Professors():
    """Count staff by rank on the USTC teachers page and write the totals.

    The page is rendered in headless Firefox and the text of one table cell
    (located by XPath) is read.  ``mr.parse2`` is then applied to three
    heading pairs, and the three results are printed and written with
    ``writeToExcel`` under indices 1, 2 and 3.

    The browser is now shut down even when loading or lookup fails.
    """
    global driver
    count = [0, 0, 0]
    url = 'http://math.ustc.edu.cn/new/teachers.php'

    options = webdriver.FirefoxOptions()
    options.set_headless()
    # `driver` is a module global; getEleByXpath presumably reads it
    # (not visible here - confirm).
    driver = webdriver.Firefox(firefox_options=options)
    try:
        driver.get(url)
        element = getEleByXpath(
            '/html/body/table[3]/tbody/tr/td[3]/table[2]/tbody/tr[3]/td').text
    finally:
        # quit() ends the whole WebDriver session, whereas close() only
        # closes the current window; run it on every exit path.
        driver.quit()
        del driver

    count[0] = mr.parse2(element, '教 授', '访问教授')
    count[1] = mr.parse2(element, '副教授', '特任副研究员')
    count[2] = mr.parse2(element, '讲师', '博士后')

    print(count)
    for i, item in enumerate(count):
        writeToExcel(i + 1, item)
def Library():
    """Read holdings and e-journal figures from the NENU library page.

    The first captured figure is multiplied by 10000, truncated to an int and
    written with ``writeToExcel(37, ...)``.  The second is an integer count
    written with ``writeToExcel(40, ...)``.
    """
    url = 'http://www.library.nenu.edu.cn/Menu/AboutUs/BGGK/TSGJJ.aspx'
    html = mr.getOnePage(url)
    # Raw string: '\d' and '\.' are invalid escapes in a normal literal and
    # trigger a SyntaxWarning on recent Python versions.
    pattern = re.compile(
        r'藏书总量约.*?(\d+\.\d+).*?万册。图书.*?电子期刊.*?>(\d+)</span>.*?种,订购',
        re.S)
    items = re.findall(pattern, html)[0]

    count1 = int(float(items[0]) * 10000)
    print('中外文藏书合计: ' + str(count1))
    writeToExcel(37, count1)

    print('中外文期刊种类: ' + items[1])
    writeToExcel(40, int(items[1]))
def Library():
    """Read holdings and periodical counts from the Fudan library page.

    The first captured figure is multiplied by 10000, truncated to an int and
    written with ``writeToExcel(37, ...)``.  The other two are added as
    integers and written with ``writeToExcel(40, ...)``.
    """
    page = mr.getOnePage('http://www.library.fudan.edu.cn/60/list.htm')

    # Every figure follows its label inside the same styled <span>.
    value_span = '</span><span lang="EN-US" style="color:#333333;">(.*?)</span>'
    matcher = re.compile(
        '馆藏纸本文献资源约' + value_span
        + '.*?中文报刊' + value_span
        + '.*?外文报刊' + value_span,
        re.S)
    first_hit = matcher.findall(page)[0]

    volumes = int(float(first_hit[0]) * 10000)
    print('中外文藏书合计: ' + str(volumes))
    writeToExcel(37, volumes)

    periodicals = int(first_hit[1]) + int(first_hit[2])
    print('中外文期刊种类: ' + str(periodicals))
    writeToExcel(40, periodicals)
def Library():
    """Read holdings, database and e-journal figures from the Tsinghua page.

    Three values are captured:

    * ``items[0]``: multiplied by 10000 and written with
      ``writeToExcel(37, ...)``;
    * ``items[1]``: integer count written with ``writeToExcel(39, ...)``;
    * ``items[2]``: multiplied by 10000 and written with
      ``writeToExcel(40, ...)``.
    """
    url = 'http://lib.tsinghua.edu.cn/about/collection.html'
    html = mr.getOnePage(url)
    # The parentheses around '件' are literal page text.  Written as a bare
    # '(件)' they formed an extra capture group, which shifted every later
    # index and made int(items[1]) receive '件'.  A character class matches
    # them literally, in either ASCII or full-width form.  The raw string
    # also avoids the invalid '\d' escape.
    pattern = re.compile(
        r'实体馆藏总量约(.*?)万册[((]件[))].*?各类数据库(\d+)个;电子期刊(.*?)万种',
        re.S)
    items = re.findall(pattern, html)[0]

    count1 = int(float(items[0]) * 10000)
    print('中外文藏书合计: ' + str(count1))
    writeToExcel(37, count1)

    count2 = int(float(items[2]) * 10000)
    print('中外文期刊种类: ' + str(count2))
    writeToExcel(40, count2)

    print('购买数据库数量: ' + items[1])
    writeToExcel(39, int(items[1]))
def Projects():
    """Count projects in four sections of the NENU page and write two totals.

    ``parse4`` is called on four heading pairs in a fixed order: the first
    two results are added into the first total, the last two into the second.
    The totals are printed and written with ``writeToExcel`` under indices 25
    and 26.
    """
    page = mr.getOnePage('http://www.nenu.edu.cn/273/list.htm')

    # Headings that are used both as a start and as an end marker.
    moe_heading = '由教育部社科司资助设立的重大课题:'
    most_heading = '由科技部资助的以国家重大需求为导向的重大科学问题研究项目(含课题),如:'

    national = 0
    provincial = 0
    national += parse4(page, '由国家社科基金资助设立的重大项目:', moe_heading)
    national += parse4(page, '由国家自然科学基金资助的、面向世界科学前沿的重大基础研究项目,如:', most_heading)
    provincial += parse4(page, moe_heading, '科技项目')
    provincial += parse4(page, most_heading, '')

    print('国家科研项目数', national)
    writeToExcel(25, national)
    print('省部科研项目数', provincial)
    writeToExcel(26, provincial)
def Library():
    """Read holdings, journal and database figures from the JLU library page.

    Five integers are captured:

    * ``items[0]``: multiplied by 10000 and written with
      ``writeToExcel(37, ...)``;
    * ``items[1] + items[2]``: written with ``writeToExcel(40, ...)``;
    * ``items[3] + items[4]``: written with ``writeToExcel(39, ...)``.
    """
    url = 'http://lib.jlu.edu.cn/portal/about/about.aspx'
    html = mr.getOnePage(url)
    # Raw string: '\d' is an invalid escape in a normal literal and triggers
    # a SyntaxWarning on recent Python versions.
    pattern = re.compile(
        r'各类纸质书刊(\d+)万册,其中.*?订购中文期刊(\d+)种,外文期刊(\d+)种,报.*?西文文献数据库(\d+)种,中文数据库(\d+)种,中',
        re.S)
    items = re.findall(pattern, html)[0]

    count1 = int(float(items[0]) * 10000)
    print('中外文藏书合计: ' + str(count1))
    writeToExcel(37, count1)

    count2 = int(items[1]) + int(items[2])
    print('中外文期刊种类: ' + str(count2))
    writeToExcel(40, count2)

    count3 = int(items[3]) + int(items[4])
    print('购买数据库数量: ' + str(count3))
    writeToExcel(39, count3)
def Library():
    """Read holdings, e-journal and database figures from the BNU page.

    Three values are captured:

    * ``items[0]``: multiplied by 10000 and written with
      ``writeToExcel(37, ...)``;
    * ``items[1]``: multiplied by 10000 and written with
      ``writeToExcel(40, ...)``;
    * ``items[2]``: integer count written with ``writeToExcel(39, ...)``.
    """
    url = 'http://www.lib.bnu.edu.cn/content/guan-chang-ji-yu'
    html = mr.getOnePage(url)
    # Raw string: '\d' is an invalid escape in a normal literal and triggers
    # a SyntaxWarning on recent Python versions.
    pattern = re.compile(
        r'纸本文献总量达(.*?)万余册,中外文全文电子期刊(\d+)万余种,中外文.*?引进中外文数据库(\d+)个,自',
        re.S)
    items = re.findall(pattern, html)[0]

    count1 = int(float(items[0]) * 10000)
    print('中外文藏书合计: ' + str(count1))
    writeToExcel(37, count1)

    count2 = int(float(items[1]) * 10000)
    print('中外文期刊种类: ' + str(count2))
    writeToExcel(40, count2)

    print('购买数据库数量: ' + items[2])
    writeToExcel(39, int(items[2]))
def ResearchAward():
    """Print parsed entries from the PKU awards page and bump three counters.

    The same page is passed to three parsers in turn; each item from
    ``parseOnePage_ResearchAward_1`` / ``_2`` / ``_3`` is printed and adds one
    to the module-level ``count_5`` / ``count_6`` / ``count_7`` respectively.
    """
    global count_5, count_6, count_7
    page = mr.getOnePage('http://www.math.lb.pku.edu.cn/kxyj/kyjl/index.htm')

    for award in parseOnePage_ResearchAward_1(page):
        print(award)
        count_5 = count_5 + 1

    for award in parseOnePage_ResearchAward_2(page):
        print(award)
        count_6 = count_6 + 1

    for award in parseOnePage_ResearchAward_3(page):
        print(award)
        count_7 = count_7 + 1
def Professor(offset):
    """Print parsed staff entries from one PKU index page and tally by rank.

    Args:
        offset: Value converted with ``str`` and inserted between ``'index'``
            and ``'.htm'`` in the page URL.

    For each item from ``parseOnePage_Professor`` the module-level list
    ``count_1`` is incremented at slot 0 ('教授'), 1 ('副教授'), 2 ('讲师')
    or 3 (any other ``item['position']``).
    """
    global count_1
    page_url = ('http://www.math.lb.pku.edu.cn/jsdw/js_20180628175159671361/index'
                + str(offset) + '.htm')
    page = mr.getOnePage(page_url)

    # Rank title -> slot in count_1; any other title falls into slot 3.
    slot_by_rank = {'教授': 0, '副教授': 1, '讲师': 2}
    for person in parseOnePage_Professor(page):
        print(person)
        count_1[slot_by_rank.get(person['position'], 3)] += 1
def Professor():
    """Read staff figures from the Nankai page and write three values.

    Four integers are captured: ``items[0]`` (first figure, used as the
    denominator), ``items[1]``, ``items[2]`` and ``items[3]``.

    * ``items[1]`` is written with ``writeToExcel(1, ...)``;
    * ``items[2] / items[0]`` is formatted as a percentage string with two
      decimals and written with ``writeToExcel(8, ...)``;
    * ``items[3]`` is written with ``writeToExcel(4, ...)``.

    The two plain counts are converted to ``int`` before being written, so
    they are numbers rather than text.
    """
    url = 'http://sms.nankai.edu.cn/5542/list.htm'
    html = mr.getOnePage(url)
    # Raw string: '\d' is an invalid escape in a normal literal and triggers
    # a SyntaxWarning on recent Python versions.
    pattern = re.compile(
        r'数学学科现有教师(\d+)人,教授(\d+)人、博士.*?博士学位的(\d+)人。其中,中国科学院院士(\d+)人、第三',
        re.S)
    items = re.findall(pattern, html)[0]

    count1 = int(items[1])
    print('教授数量: ' + str(count1))
    writeToExcel(1, count1)

    # Share of items[2] in items[0], rendered as e.g. '87.50%'.
    count2 = int(items[2]) / int(items[0])
    count2 = '%.2f%%' % (count2 * 100)
    print('教师博士数量: ' + str(count2))
    writeToExcel(8, count2)

    count3 = int(items[3])
    print('院士数量: ' + str(count3))
    writeToExcel(4, count3)
def ResearchAward():
    """Read award and monograph figures from the NENU page and write them.

    Four integers are captured:

    * ``items[2] + items[3]`` is written with ``writeToExcel(9, ...)``;
    * ``items[1]`` is written with ``writeToExcel(10, ...)``;
    * ``items[0]`` is written with ``writeToExcel(19, ...)``.
    """
    count = [0, 0, 0]
    url = 'http://www.nenu.edu.cn/275/list.htm'
    html = mr.getOnePage(url)
    # The ')' after '奖2项' is literal page text.  Left bare it is an
    # unbalanced parenthesis and re.compile raises re.error; a character
    # class matches it literally, in either ASCII or full-width form.  The
    # raw string also avoids the invalid '\d' escape.
    pattern = re.compile(
        r'学术专著(\d+)部,获得省部级科研奖励(\d+)项。6部著.*?文库》;(\d+)项成果获得全国高等.*?奖2项[))];(\d+)项成果获全国教育',
        re.S)
    items = re.findall(pattern, html)[0]

    count[0] = int(items[2]) + int(items[3])
    count[1] = int(items[1])
    count[2] = int(items[0])

    print('国家奖', count[0])
    writeToExcel(9, count[0])
    print('省部级奖', count[1])
    writeToExcel(10, count[1])
    print('学术专著', count[2])
    writeToExcel(19, count[2])
def WSL2(row, x, url, pt):
    """Scrape one dxsbb.com news page into ``Results/WSL201<x>.xls``.

    Args:
        row: Passed through to ``mr.initExcel`` together with the file name.
        x: Year suffix used in the output file name.
        url: Path appended to ``'https://www.dxsbb.com/news/'``.
        pt: Regex pattern string applied to the page with ``re.S``.

    Each match becomes one sheet row (starting at row 1); the number of
    columns is the length of the first match.  Column 1 keeps only the CJK
    characters of the cell; every other column is written unchanged.
    """
    out_path = 'Results/WSL201' + str(x) + '.xls'
    print('Writing to ' + out_path + ' ... ...')
    mr.initExcel(row, out_path)

    html = mr.getOnePage('https://www.dxsbb.com/news/' + url, 'gbk')
    records = re.findall(re.compile(pt, re.S), html)

    width = len(records[0])
    cjk_run = re.compile("[\u4e00-\u9fa5]+", re.S)

    for row_no, fields in enumerate(records, start=1):
        for col in range(width):
            value = fields[col]
            if col == 1:
                # Drop everything except CJK characters from this cell.
                value = "".join(re.findall(cjk_run, value))
            mr.writeToExcel(out_path, row_no, col, value)
        print(row_no)