def test_ieee():
    """Manual probe against a single IEEE Xplore document page.

    Fetches the page, prints its raw text, then prints the list of elements
    matched by a fixed absolute XPath ending at the first link of the
    document toolbar.
    """
    toolbarLinkXPath = '//*[@id="LayoutWrapper"]/div/div/div/div[5]/div[2]/xpl-root/xpl-document-details/div/div[1]/section[2]/div/xpl-document-header/section/div[2]/div/div/div[3]/div[2]/div[2]/xpl-document-toolbar/div/div/ul/li[1]/a'
    response = tools.requestsGet('https://ieeexplore.ieee.org/document/915037', headers=headers)
    print(response.text)
    page = etree.HTML(response.text)
    print(page.xpath(toolbarLinkXPath))
def getBaseInfoURL(conference, logPath='', warningPath=''):
    """Collect the XML document links listed on a venue's dblp index page.

    :param conference: venue key appended to ``https://dblp.org/db/``
    :param logPath: log file path forwarded to ``tools.requestsGet``
    :param warningPath: warning file path forwarded to ``tools.requestsGet``
    :return: list of hrefs containing ".xml" when the key starts with
        "conf/"; an empty list when the page could not be fetched or for
        any other kind of key
    """
    indexUrl = 'https://dblp.org/db/' + conference
    response = tools.requestsGet(indexUrl,
                                 headers=headers,
                                 logPath=logPath,
                                 warningPath=warningPath)
    # The request helper signals failure with an empty string.
    if response == '':
        return []

    page = etree.HTML(response.text)
    # Only conference keys are resolved here (one XML link per year).
    if conference.startswith('conf/'):
        return page.xpath('//a[contains(@href, ".xml")]/attribute::href')
    return []
def getPdfURL(MyWebdriver, logPath='', warningPath=''):
    """Resolve the PDF address for the IEEE paper currently open in the driver.

    The paper number is taken from the last path segment of the driver's
    current URL, the matching ``stamp.jsp`` page is fetched, and the ``src``
    of its first ``<iframe>`` (query string removed) is returned.

    :param MyWebdriver: object exposing a ``current_url()`` method
    :param logPath: log file path
    :param warningPath: warning file path
    :return: the PDF URL, or '' when the stamp page cannot be fetched, has no
        iframe, or the iframe carries no ``src`` attribute
    """
    pdfBaseURL = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber='
    # The browser may have been redirected, so read the address it ended on.
    current_url = MyWebdriver.current_url()
    # Last path segment without the query string is used as the paper number.
    paperNumber = current_url.split('/')[-1].split('?')[0]
    pdfHtmlURL = pdfBaseURL + paperNumber

    response = tools.requestsGet(pdfHtmlURL,
                                 headers=headers,
                                 times=2,
                                 logPath=logPath,
                                 warningPath=warningPath)
    # The request helper signals failure with an empty string.
    if response == '':
        warningInfo = 'Can not get the pdfURL from this page, for failed to get the pdfHtml {0}'.format(pdfHtmlURL)
        tools.warning(warningInfo, warningPath)
        return ''

    try:
        pdfHtml = etree.HTML(response.content)
        iframe = pdfHtml.xpath('//iframe')[0]
    except Exception as e:
        warningInfo = 'Can not get the pdfURL from this page {0}\n Failed info: {1}'.format(pdfHtmlURL, repr(e))
        tools.warning(warningInfo, warningPath)
        return ''

    # Element.get returns None for a missing attribute; report it like the
    # other failures instead of crashing on None.split.
    src = iframe.get('src')
    if src is None:
        warningInfo = 'Can not get the pdfURL from this page {0}\n Failed info: {1}'.format(
            pdfHtmlURL, 'iframe has no src attribute')
        tools.warning(warningInfo, warningPath)
        return ''

    pdfURL = src.split('?')[0]
    successInfo = 'Successfully get the pdfURL from this page'
    if pdfURL == '':
        successInfo = '!!!Successfully get the pdfURL from this page, but the pdfURL is None'
    tools.log(successInfo, logPath)
    return pdfURL
def search(title, logPath='', warningPath=''):
    """Search the ACM Digital Library for a paper title.

    :param title: paper title; leading/trailing '.' characters are stripped
        before it is used as the query
    :param logPath: log file path
    :param warningPath: warning file path
    :return: 'https://dl.acm.org/' + href of the first result when that
        result's title starts with the query title (compared after removing
        spaces and ``- , : .`` and lower-casing); '' when the search page
        cannot be fetched or nothing matches
    """

    def compareTitle(title, resultTitle):
        """Return True when the normalised result title starts with the normalised query."""
        normalised = []
        for text in (title, resultTitle):
            for char in (' ', '-', ',', ':', '.'):
                text = text.replace(char, '')
            normalised.append(text.lower())
        wanted, found = normalised
        return found.startswith(wanted)

    title = title.strip('.')
    baseSearchUrl = 'https://dl.acm.org/results.cfm'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
        'referer': 'https://dl.acm.org/results.cfm?',
        'upgrade-insecure-requests': '1',
    }
    data = {
        'query': title,
    }
    newURL = ''

    response = tools.requestsGet(baseSearchUrl,
                                 headers=headers,
                                 params=data,
                                 times=2,
                                 logPath=logPath,
                                 warningPath=warningPath)
    # The search page could not be fetched (helper returns an empty string).
    if response == '':
        warningInfo = 'Failed search <{0}> in ACM, For can not open the search page'.format(
            title)
        tools.warning(warningInfo, warningPath)
        return newURL

    html = etree.HTML(response.content)
    resultTitles = html.xpath('//*[@id="results"]/div[5]/div[1]/a/text()')
    if resultTitles and compareTitle(title, resultTitles[0]):
        # The first result matches the requested title.
        newURL = html.xpath('//*[@id="results"]/div[5]/div[1]/a')[0].get(
            'href')
        newURL = 'https://dl.acm.org/' + newURL
        logInfo = 'Successfully find <{0}> in ACM,and newURL is {1}'.format(
            title, newURL)
        tools.log(logInfo, logPath)
    else:
        # No matching result: report through the warning channel, like every
        # other failure path (this branch previously used tools.log).
        warningInfo = 'Failed to find <{0}> in ACM, For none matched result'.format(
            title)
        tools.warning(warningInfo, warningPath)
    return newURL
def test_acm():
    """Manual probe against a single ACM citation page.

    Prints the text nodes of the first anchor whose title contains
    "FullText PDF", then the tail text of the first ``<img>`` inside such an
    anchor.
    """
    anchorXPath = '//a[contains(@title, "FullText PDF")]'
    response = tools.requestsGet('https://dl.acm.org/citation.cfm?doid=3092627.3092632', headers=headers)
    page = etree.HTML(response.text)

    firstAnchor = page.xpath(anchorXPath)[0]
    print(firstAnchor.xpath('./text()'))

    firstImage = page.xpath(anchorXPath + '/img')[0]
    print(firstImage.tail)
def countInIEEE(folderPath):
    """Count the papers of one year/volume folder that can be located on IEEE.

    Reads ``paperInfo.xml`` from the folder and, for every ``<hit>`` that has
    an ``info/pages`` element, counts it when its ``info/ee`` link resolves to
    an ``https://ieeexplore.ieee.org/`` URL or, failing that, when
    ``IEEE.search`` returns a non-empty URL for its title.

    :param folderPath: folder containing ``paperInfo.xml``
    :return: number of hits counted as being in IEEE
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
    }
    num = 0
    infoPath = os.path.join(folderPath, 'paperInfo.xml')
    # Read bytes so lxml decodes the document itself; it rejects str input
    # that carries an XML encoding declaration.
    with open(infoPath, 'rb') as f:
        paperInfo = f.read()
    xml = etree.fromstring(paperInfo)

    for hit in xml.xpath('//hit'):
        # Hits without a pages element are skipped.
        if hit.find('./info/pages') is None:
            continue
        # Random 10-40 s pause before each lookup (presumably to throttle
        # requests to the remote sites).
        waitingTime = random.randint(10, 40)
        time.sleep(waitingTime)
        print()

        title = hit.xpath('./info/title')[0].text
        # findtext yields None when <ee> is absent, so the guard below covers
        # both a missing and an empty link instead of raising IndexError.
        ee = hit.findtext('./info/ee')
        if ee:
            response = tools.requestsGet(ee, headers=headers)
            # response.url is the final address after any redirects.
            if response != '' and response.url.startswith('https://ieeexplore.ieee.org/'):
                print('{0} in IEEE'.format(title))
                num += 1
                continue

        # Fall back to searching IEEE by title.
        newURL = IEEE.search(title)
        if newURL != '':
            print('{0} in IEEE'.format(title))
            num += 1
    return num
def downloadPDF(pdfURL, pdfPath, logPath='', warningPath=''):
    """Fetch a PDF and hand its bytes to ``tools.write``.

    :param pdfURL: address of the PDF
    :param pdfPath: destination path of the PDF
    :param logPath: log file path
    :param warningPath: warning file path
    :return: False when ``pdfURL`` is empty/None or the request fails;
        otherwise whatever ``tools.write`` returns (presumably True on
        success and False on failure)
    """
    # Nothing to fetch without an address.
    if pdfURL in ('', None):
        tools.warning('Failed to download the paper, for pdfURL is none', warningPath)
        return False

    browserHeaders = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
    }
    pdf = tools.requestsGet(pdfURL,
                            headers=browserHeaders,
                            logPath=logPath,
                            warningPath=warningPath)
    # The request helper signals failure with an empty string.
    if pdf == '':
        tools.warning(
            'Failed to download the {0} from the page {1}'.format(pdfPath, pdfURL),
            warningPath)
        return False

    # The download itself worked; the outcome of the write is the return value.
    tools.log('Successfully download {0}'.format(pdfPath), logPath)
    return tools.write(pdf.content,
                       pdfPath,
                       mode='wb',
                       logPath=logPath,
                       warningPath=warningPath)
def createBaseInfoXML(conference,
                      papersfolderPath=defaultPath,
                      logPath='',
                      warningPath=''):
    """Build the base information for every year/volume of a venue.

    For a "conf/" key, each yearly dblp XML document is fetched, appended to
    a ``<Conference>`` root that is written to ``baseInfo.xml``, and
    ``createPaperInfoXML`` is called for it. For a "journals/" key,
    ``createPaperInfoXML`` is called for each volume link on the journal
    page. Entries older than ``projectInfo.firstYear`` stop the respective
    loop. Nothing is done when the venue folder already exists.

    :param conference: conference/journal key (e.g. starting with "conf/" or "journals/")
    :param papersfolderPath: root directory where paper collections are stored
    :param logPath: log file path (use an absolute path)
    :param warningPath: warning file path (use an absolute path)
    :return: None
    """
    baseUrl = 'https://dblp.org/'
    venueFolder = papersfolderPath + conference
    # An existing folder means this venue was already set up.
    if os.path.exists(venueFolder):
        return
    os.makedirs(venueFolder)

    if conference.startswith('conf/'):
        # Outermost element of the conference's baseInfo document.
        root = etree.Element('Conference', name=conference)
        xmlsUrl = getBaseInfoURL(conference,
                                 logPath=logPath,
                                 warningPath=warningPath)
        for xmlUrl in xmlsUrl:
            response = tools.requestsGet(xmlUrl,
                                         headers=headers,
                                         logPath=logPath,
                                         warningPath=warningPath)
            # The request helper signals failure with an empty string.
            if response == '':
                warningInfo = 'Failed to get the dblps from the page {0}'.format(
                    xmlUrl)
                tools.warning(warningInfo, warningPath)
                continue
            xml = etree.fromstring(response.content)
            for dblp in xml.xpath('/dblp'):
                year = dblp.xpath('//year')[0].text
                if int(year) < projectInfo.firstYear:
                    break
                url = dblp.xpath('//url')[0].text
                name = url.split('/')[-1].split('.')[0]
                url = baseUrl + url
                # The publisher is optional downstream (default ''), so a
                # missing or empty <publisher> must not abort the run.
                publishers = dblp.xpath('//publisher')
                publisher = (publishers[0].text or '') if publishers else ''
                root.append(dblp)
                # Build the per-year paper info document.
                createPaperInfoXML(url,
                                   name,
                                   conference,
                                   publisher,
                                   papersfolderPath=papersfolderPath,
                                   logPath=logPath,
                                   warningPath=warningPath)
        with open('{0}{1}/baseInfo.xml'.format(papersfolderPath, conference),
                  'wb') as f:
            f.write(etree.tostring(root))
    elif conference.startswith('journals/'):
        journalsBaseURL = 'https://dblp.org/db/'
        response = tools.requestsGet(journalsBaseURL + conference,
                                     headers=headers,
                                     logPath=logPath,
                                     warningPath=warningPath)
        # The request helper signals failure with an empty string.
        if response == '':
            return
        html = etree.HTML(response.text)
        volumes = html.xpath('//*[@id="main"]/ul/li/a')
        for volume in volumes:
            name = volume.text
            # The year is read from the last space-separated token of the
            # link text, before any '/'.
            year = name.split(' ')[-1].split('/')[0]
            if int(year) < projectInfo.firstYear:
                # Stop like the conference branch does, so the success log
                # below is still written (this used to be a bare return).
                break
            url = volume.xpath('./attribute::href')[0]
            createPaperInfoXML(url,
                               name,
                               conference,
                               papersfolderPath=papersfolderPath,
                               logPath=logPath,
                               warningPath=warningPath)

    successInfo = "{0}'s baseInfo setup".format(conference)
    tools.log(successInfo, logPath)
    tools.log('\n', logPath, hasTime=False)
def createPaperInfoXML(paperInfoUrl,
                       name,
                       conference,
                       publisher='',
                       papersfolderPath=defaultPath,
                       logPath='',
                       warningPath=''):
    """Build ``paperInfo.xml`` for one year/volume of a venue.

    The page at ``paperInfoUrl`` is searched for a "format=xml" link holding
    all paper records. If none exists, the page is assumed to list several
    volumes and the function recurses into each volume link. The resulting
    hits are wrapped in a ``<Year>`` root and written to
    ``<papersfolderPath><conference>/<name>/paperInfo.xml``.

    :param paperInfoUrl: URL of the page describing this year's/volume's papers
    :param name: folder name for this year/volume (sanitised with ``tools.toFilename``)
    :param conference: venue this year/volume belongs to
    :param publisher: publisher recorded on the ``<Year>`` root
    :param papersfolderPath: root directory where paper collections are stored
    :param logPath: log file path (absolute path)
    :param warningPath: warning file path (absolute path)
    :return: None
    """
    name = tools.toFilename(name, False)
    # An existing folder means this year/volume was already set up.
    if os.path.exists('{0}{1}/{2}'.format(papersfolderPath, conference,
                                          name)):
        return

    response = tools.requestsGet(paperInfoUrl,
                                 headers=headers,
                                 logPath=logPath,
                                 warningPath=warningPath)
    # The page for this year could not be fetched.
    if response == '':
        warningInfo = 'Failed to construct the {0}{1}/{2}/paperInfo.xml'.format(
            papersfolderPath, conference, name)
        tools.warning(warningInfo, warningPath)
        tools.warning('\n', warningPath, hasTime=False)
        return

    # Look for the XML document that holds every paper's basic information.
    html = etree.HTML(response.text)
    paperInfoXMLURLs = html.xpath(
        '//a[contains(@href, "format=xml")]/attribute::href')

    # No such link: either the year is split into volumes or the XPath no
    # longer matches the page.
    if not paperInfoXMLURLs:
        # Kept separate from `name` so the warning below reports the real path.
        volumePrefix = name + '-'
        volumeURLs = html.xpath(
            '//a[contains(@href, "{0}")]/attribute::href'.format(volumePrefix))
        # No volume links either; the XPath expressions need checking.
        if not volumeURLs:
            warningInfo = 'Failed to get the xml address from the page {0}, Please check your xpath expression'.format(
                paperInfoUrl)
            tools.warning(warningInfo, warningPath)
            warningInfo = 'Failed to construct the {0}{1}/{2}/paperInfo.xml'.format(
                papersfolderPath, conference, name)
            tools.warning(warningInfo, warningPath)
            tools.warning('\n', warningPath, hasTime=False)
            return
        # Build one document per volume, under the same root folder as this
        # call (papersfolderPath used to be dropped here).
        for volumeURL in volumeURLs:
            volumeName = volumeURL.split('/')[-1].split('.')[0]
            createPaperInfoXML(volumeURL,
                               volumeName,
                               conference,
                               publisher,
                               papersfolderPath=papersfolderPath,
                               logPath=logPath,
                               warningPath=warningPath)
        return

    # Fetch the XML document that was found.
    paperInfoXMLURL = paperInfoXMLURLs[0]
    response = tools.requestsGet(paperInfoXMLURL,
                                 headers=headers,
                                 logPath=logPath,
                                 warningPath=warningPath)
    if response == '':
        warningInfo = 'Failed to get the xml document {0}'.format(
            paperInfoXMLURL)
        tools.warning(warningInfo, warningPath)
        warningInfo = 'Failed to construct the {0}{1}/{2}/paperInfo.xml'.format(
            papersfolderPath, conference, name)
        tools.warning(warningInfo, warningPath)
        tools.warning('\n', warningPath, hasTime=False)
        return

    paperInfoXML = etree.XML(response.content)
    paperInfo = paperInfoXML.xpath('//hits')[0]
    # Outermost element of the output document.
    root = etree.Element('Year', name=name, publisher=publisher)

    # len() of an element is its number of children. An empty <hits> means
    # the bulk document carried no records, so each paper's XML is fetched
    # individually and stitched together.
    if len(paperInfo) <= 0:
        # All ".xml" links on the page except the first one.
        paperInfoXMLURLs = html.xpath(
            '//a[contains(@href, ".xml")]/attribute::href')[1:]
        paperNum = len(paperInfoXMLURLs)
        hits = etree.Element('hits',
                             total=str(len(paperInfoXMLURLs)),
                             completed='0',
                             current='0')
        logInfo = 'Getting paper infos one by one, {0}/{1} has {2} papers'.format(
            conference, name, paperNum)
        tools.log(logInfo, logPath)
        for paperInfoXMLURL in paperInfoXMLURLs:
            response = tools.requestsGet(paperInfoXMLURL,
                                         headers=headers,
                                         logPath=logPath,
                                         warningPath=warningPath)
            # This paper's XML could not be fetched; skip it.
            if response == '':
                warningInfo = "Failed to get the paper's info from the page {0}".format(
                    paperInfoXMLURL)
                tools.warning(warningInfo, warningPath)
                continue
            # Rename the record's elements to the hit/info layout used by
            # the bulk document.
            paperInfoXML = etree.XML(response.content)
            hit = paperInfoXML.xpath('//dblp')[0]
            hit.tag = 'hit'
            hit.set('hasDownloadPDF', 'False')
            hit.set('hasSolved', 'False')
            # NOTE(review): only <inproceedings> records are handled; a
            # record of another type would raise IndexError here - confirm.
            info = paperInfoXML.xpath('//inproceedings')[0]
            info.tag = 'info'
            hits.append(hit)
        root.append(hits)
    # The bulk document already contains every paper's record.
    else:
        paperInfo.set('completed', '0')
        paperInfo.set('current', '0')
        paperNum = 0
        hits = paperInfo.xpath('.//hit')
        for hit in hits:
            hit.set('hasDownloadPDF', 'False')
            hit.set('hasSolved', 'False')
            # Only hits with a pages element count as papers.
            if hit.find('.//pages') is not None:
                paperNum += 1
        paperInfo.set('total', str(paperNum))
        root.append(paperInfo)

    # Create the folder for this year/volume and write the document.
    os.makedirs('{0}{1}/{2}'.format(papersfolderPath, conference, name))
    with open(
            '{0}{1}/{2}/paperInfo.xml'.format(papersfolderPath, conference,
                                              name), 'wb') as f:
        f.write(etree.tostring(root))

    successInfo = "{0}/{1}'s baseInfo set up, it has {2} papers totally".format(
        conference, name, paperNum)
    tools.log(successInfo, logPath)
    tools.log('\n', logPath, hasTime=False)