def getBrief(this):
    '''
    Extract the scholar's short biography.

    Looks up the first child of the "ant-tabs-nav-list" element (called the
    "basic" tab in this code), hands it to ``__clickIfNotActive`` together
    with the biography XPath, and then reads the biography text.

    Returns:
        Whatever ``getTextByXpath`` yields for the biography XPath, or
        ``None`` when the first tab cannot be found.
    '''
    bioXpath = '//*[contains(@class,"active")]//*[contains(@class,"bio")]'
    tabs = this.__dv.find_elements_by_xpath(
        '//*[contains(@class,"ant-tabs-nav-list")]/*[1]')
    if tabs:
        # NOTE(review): presumably activates the tab and waits for bioXpath
        # to show up -- confirm against __clickIfNotActive.
        this.__clickIfNotActive(tabs[0], waitLoading, bioXpath)
        return getTextByXpath(this.__dv, bioXpath)
    return None
def getExperience(this):
    '''
    Extract the scholar's work experience.

    Uses the first child of the "ant-tabs-nav-list" element (the "basic"
    tab in this code): it is passed to ``__clickIfNotActive`` along with the
    experience XPath before the text is read.

    Returns:
        The result of ``getTextByXpath`` for the ``aff_inst`` entries inside
        the active pane, or ``None`` when the first tab is not present.
    '''
    tabXpath = '//*[contains(@class,"ant-tabs-nav-list")]/*[1]'
    experienceXpath = '//*[contains(@class,"active")]//*[@class="aff_inst"]/div'

    candidates = this.__dv.find_elements_by_xpath(tabXpath)
    if not candidates:
        return None

    firstTab = candidates[0]
    # NOTE(review): presumably clicks the tab when it is not active and
    # waits for experienceXpath -- confirm against __clickIfNotActive.
    this.__clickIfNotActive(firstTab, waitLoading, experienceXpath)
    return getTextByXpath(this.__dv, experienceXpath)
def getPapers(this):
    '''
    Extract the scholar's papers (a single page sorted by citation count in
    descending order, so that the result can be de-duplicated).

    Returns:
        A list with one dict per paper element found, with the keys "id",
        "title", "authors", "venue" and "cited". An empty list is returned
        when the second tab is missing or disabled, when no paper content is
        present after opening the tab, or when the "noData" image is shown
        after sorting.

    Raises:
        Exception: "no sort by reference" when the sort control cannot be
            found; "load failed" when ``waitTillLoaded`` returns a falsy
            value.
    '''
    # Second tab of the tab bar (the "academic achievements" tab), clickable.
    pathAchievements = '//*[contains(@class,"ant-tabs-nav-list")]/*[2]'
    # Section titles (published papers or research projects) that are only
    # matched once the second tab carries the "active" class.
    pathPP = '//*[contains(@class,"ant-tabs-nav-list")]/*[2]/self::*[contains(@class,"active")]/../../../..//span[@class="title"]'
    # "Sort by citation count" control, clickable.
    pathRefSort = '//*[contains(@class,"pubs_sort_line")]/div/*[2]'
    # Paper entries, only matched once the sort control is active.
    pathPaper = (
        '//*[contains(@class,"pubs_sort_line")]/div'
        '/*[2]/self::*[contains(@class,"active")]/../../../../..'
        '//*[@class="content"]'
    )
    # Relative path from a paper entry to the element holding its id.
    pathId = '..'
    # Relative path to the paper title.
    pathTitle = './/*[contains(@class,"title")]/span'
    # Relative path to the paper authors.
    pathAuthor = './/*[contains(@class,"authors")]'
    # Relative path to the paper venue.
    pathVenue = './/*[contains(@class,"venue-line")]'
    # Relative path to the citation count.
    pathCited = './/*[@class="cited"]/strong'
    # Loading spinner.
    pathLoading = '//*[contains(@class,"sk_chase")]'
    # Placeholder image shown when there are no papers.
    pathNodata = '//div[contains(@class,"profilePapers___1bMnJ")]//img[contains(@src,"noData")]'
    # Any paper content at all.
    pathHavePapers = '//div[contains(@class,"profilePapers___1bMnJ")]//*[@class="content"]'

    # Open the "academic achievements" tab.
    achievements = this.__dv.find_elements_by_xpath(pathAchievements)
    if not achievements:
        return []
    achievements = achievements[0]
    # The tab exists but is disabled: nothing to extract. get_attribute()
    # returns None for a missing attribute, so fall back to an empty string
    # rather than failing with a TypeError on the membership test.
    tabClass = achievements.get_attribute('class') or ''
    if 'disabled' in tabClass:
        return []
    this.__clickIfNotActive(achievements, waitLoading, pathPP)

    # Switch to "sort by citation count".
    if not this.__dv.find_elements_by_xpath(pathHavePapers):
        return []
    refSort = this.__dv.find_elements_by_xpath(pathRefSort)
    if not refSort:
        raise Exception("no sort by reference")
    refSort = refSort[0]
    # NOTE(review): the meaning of the trailing positional argument `1` is
    # defined by __clickIfNotActive -- confirm there.
    this.__clickIfNotActive(
        refSort, waitLoading, pathPaper + '|' + pathNodata, 1)
    if not waitTillLoaded(
            this.__dv, waitLoading, value=pathLoading, waitUnit=waitUnit):
        raise Exception("load failed")
    if this.__dv.find_elements_by_xpath(pathNodata):
        return []

    # Collect the results.
    papers = this.__dv.find_elements_by_xpath(pathPaper)
    return [{
        "id": stdAminerId(getAttributeByXpath(p, pathId, 'id')),
        "title": getTextByXpath(p, pathTitle),
        "authors": rmUnseen(getTextByXpath(p, pathAuthor)),
        "venue": getTextByXpath(p, pathVenue),
        "cited": getTextByXpath(p, pathCited),
    } for p in papers]
def getAddress(this):
    '''
    Extract the scholar's address.

    Reads the ``span`` next to the element whose class contains
    "map-marker" and returns whatever ``getTextByXpath`` yields for it.
    '''
    return getTextByXpath(
        this.__dv, '//*[contains(@class,"map-marker")]/../span')
def getFax(this):
    '''
    Extract the scholar's fax number.

    Reads the ``span`` next to the element whose class contains "fax" and
    returns whatever ``getTextByXpath`` yields for it.
    '''
    return getTextByXpath(
        this.__dv, '//*[contains(@class,"fax")]/../span')
def getEmail(this):
    '''
    Extract the scholar's email.

    Reads the ``span`` next to the element whose class contains "envelope"
    and returns whatever ``getTextByXpath`` yields for it.
    '''
    return getTextByXpath(
        this.__dv, '//*[contains(@class,"envelope")]/../span')
def getPhone(this):
    '''
    Extract the scholar's phone number.

    Reads the ``span`` next to the element whose class contains "phone" and
    returns whatever ``getTextByXpath`` yields for it.
    '''
    return getTextByXpath(
        this.__dv, '//*[contains(@class,"phone")]/../span')
def getDepartment(this):
    '''
    Extract the scholar's institution.

    Reads the ``span`` next to the element whose class contains
    "institution" and returns whatever ``getTextByXpath`` yields for it.
    '''
    return getTextByXpath(
        this.__dv, '//*[contains(@class,"institution")]/../span')
def getTitle(this):
    '''
    Extract the scholar's professional title.

    Reads the ``span`` next to the element whose class contains "briefcase"
    and returns whatever ``getTextByXpath`` yields for it.
    '''
    return getTextByXpath(
        this.__dv, '//*[contains(@class,"briefcase")]/../span')