def test_ieee():
    """Manual smoke check against one IEEE Xplore document page.

    Requests the page, prints its raw text, then prints whatever the
    absolute XPath for the first toolbar anchor returns.
    """
    pdfLinkXPath = '//*[@id="LayoutWrapper"]/div/div/div/div[5]/div[2]/xpl-root/xpl-document-details/div/div[1]/section[2]/div/xpl-document-header/section/div[2]/div/div/div[3]/div[2]/div[2]/xpl-document-toolbar/div/div/ul/li[1]/a'
    page = tools.requestsGet('https://ieeexplore.ieee.org/document/915037',
                             headers=headers)
    print(page.text)
    tree = etree.HTML(page.text)
    # Printed as-is: the result is the list of matching elements.
    print(tree.xpath(pdfLinkXPath))
# Example no. 2
# 0
def getBaseInfoURL(conference, logPath='', warningPath=''):
    """Collect the XML links listed on a venue's dblp page.

    :param conference: venue key appended to ``https://dblp.org/db/``
    :param logPath: log file path forwarded to ``tools.requestsGet``
    :param warningPath: warning file path forwarded to ``tools.requestsGet``
    :return: list of hrefs containing ``.xml`` when the key starts with
        ``conf/``; an empty list when the request fails or for any other key
    """
    page = tools.requestsGet('https://dblp.org/db/' + conference,
                             headers=headers,
                             logPath=logPath,
                             warningPath=warningPath)
    # A failed request comes back as an empty string; report no links.
    if page == '':
        return []

    tree = etree.HTML(page.text)

    # Only conference keys are resolved here; journal handling is not
    # implemented in this function.
    if not conference.startswith('conf/'):
        return []
    return tree.xpath('//a[contains(@href, ".xml")]/attribute::href')
def getPdfURL(MyWebdriver, logPath='', warningPath=''):
    """Derive the direct PDF address for the paper currently shown in the driver.

    The paper number is taken from the last path segment of the driver's
    current URL, the IEEE ``stamp.jsp`` page for that number is requested,
    and the ``src`` of its first ``<iframe>`` (query string removed) is
    returned.

    :param MyWebdriver: object exposing ``current_url()``
        (NOTE(review): project wrapper type, not visible here)
    :param logPath: log file path
    :param warningPath: warning file path
    :return: the PDF URL, or ``''`` when it cannot be determined
    """
    pdfBaseURL = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber='
    # Read the URL from the driver because the page may have been redirected.
    current_url = MyWebdriver.current_url()
    # The paper number is the last path segment, without any query string.
    paperNumber = current_url.split('/')[-1].split('?')[0]
    pdfHtmlURL = pdfBaseURL + paperNumber

    pdfURL = ''

    response = tools.requestsGet(pdfHtmlURL, headers=headers, times=2, logPath=logPath, warningPath=warningPath)
    # An empty string means the stamp page could not be opened.
    if response == '':
        warningInfo = 'Can not get the pdfURL from this page, for failed to get the pdfHtml {0}'.format(pdfHtmlURL)
        tools.warning(warningInfo, warningPath)
        return pdfURL

    try:
        pdfHtml = etree.HTML(response.content)
        iframe = pdfHtml.xpath('//iframe')[0]
    except Exception as e:
        # Parsing problems and a missing iframe are reported the same way.
        warningInfo = 'Can not get the pdfURL from this page {0}\n              Failed info: {1}'.format(pdfHtmlURL, repr(e))
        tools.warning(warningInfo, warningPath)
        return pdfURL

    # An iframe without a src attribute yields None; treat it as an empty
    # URL so the "pdfURL is None" log below is reached instead of crashing.
    src = iframe.get('src') or ''
    pdfURL = src.split('?')[0]
    successInfo = 'Successfully get the pdfURL from this page'
    if pdfURL == '':
        successInfo = '!!!Successfully get the pdfURL from this page, but the pdfURL is None'
    tools.log(successInfo, logPath)

    return pdfURL
def search(title, logPath='', warningPath=''):
    """Look a paper title up on the ACM search page.

    :param title: paper title; leading/trailing dots are stripped first
    :param logPath: log file path
    :param warningPath: warning file path
    :return: ``'https://dl.acm.org/'`` plus the href of the first result
        when its title starts with the requested one, otherwise ``''``
    """

    def compareTitle(title, resultTitle):
        """Return True when the normalized result starts with the normalized title."""

        def normalize(text):
            # Ignore spacing, common punctuation and letter case.
            for ch in (' ', '-', ',', ':', '.'):
                text = text.replace(ch, '')
            return text.lower()

        wanted = normalize(title)
        # An empty needle is a prefix of every string; never call it a match.
        return bool(wanted) and normalize(resultTitle).startswith(wanted)

    title = title.strip('.')
    baseSearchUrl = 'https://dl.acm.org/results.cfm'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
        'referer': 'https://dl.acm.org/results.cfm?',
        'upgrade-insecure-requests': '1',
    }
    data = {
        'query': title,
    }
    newURL = ''

    response = tools.requestsGet(baseSearchUrl,
                                 headers=headers,
                                 params=data,
                                 times=2,
                                 logPath=logPath,
                                 warningPath=warningPath)
    # An empty string means the search page could not be requested.
    if response == '':
        warningInfo = 'Failed search <{0}> in ACM, For can not open the search page'.format(
            title)
        tools.warning(warningInfo, warningPath)
        return newURL

    html = etree.HTML(response.content)
    resultTitles = html.xpath('//*[@id="results"]/div[5]/div[1]/a/text()')
    if resultTitles and compareTitle(title, resultTitles[0]):
        # The first result matches: build its absolute address.
        newURL = html.xpath('//*[@id="results"]/div[5]/div[1]/a')[0].get(
            'href')
        newURL = 'https://dl.acm.org/' + newURL
        logInfo = 'Successfully find <{0}> in ACM,and newURL is {1}'.format(
            title, newURL)
        tools.log(logInfo, logPath)
    else:
        # No usable result: record it with the warning helper, like every
        # other failure path that writes to warningPath.
        warningInfo = 'Failed to find <{0}> in ACM, For none matched result'.format(
            title)
        tools.warning(warningInfo, warningPath)

    return newURL
def test_acm():
    """Manual smoke check against one ACM citation page.

    Prints the text nodes of the first anchor whose title contains
    "FullText PDF", then the tail text of the ``<img>`` inside such an anchor.
    """
    pdfAnchorXPath = '//a[contains(@title, "FullText PDF")]'
    page = tools.requestsGet(
        'https://dl.acm.org/citation.cfm?doid=3092627.3092632',
        headers=headers)
    tree = etree.HTML(page.text)

    anchor = tree.xpath(pdfAnchorXPath)[0]
    print(anchor.xpath('./text()'))

    image = tree.xpath(pdfAnchorXPath + '/img')[0]
    print(image.tail)
def countInIEEE(folderPath):
    """Count the papers of one year/volume folder that can be located on IEEE.

    Reads ``paperInfo.xml`` from the folder and, for every ``<hit>`` that
    has a ``pages`` entry, counts it when either its ``ee`` link resolves
    to an ``https://ieeexplore.ieee.org/`` address or ``IEEE.search`` returns
    a non-empty URL for its title. A random 10-40 second pause precedes
    each examined hit.

    :param folderPath: folder that contains ``paperInfo.xml``
    :return: number of hits found on IEEE
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
    }

    infoPath = os.path.join(folderPath, 'paperInfo.xml')
    # Hand lxml the raw bytes so it decodes the document itself, independent
    # of the platform's default text encoding.
    with open(infoPath, 'rb') as f:
        xml = etree.fromstring(f.read())

    num = 0
    for hit in xml.xpath('//hit'):
        # Entries without a pages element are skipped.
        if hit.find('./info/pages') is None:
            continue

        time.sleep(random.randint(10, 40))
        print()
        title = hit.xpath('./info/title')[0].text
        # findtext yields None when the hit has no <ee> element, instead of
        # failing on an empty xpath result; the title search is used then.
        ee = hit.findtext('./info/ee')
        if ee:
            response = tools.requestsGet(ee, headers=headers)
            # response.url is the address after any redirects were followed.
            if response != '' and response.url.startswith(
                    'https://ieeexplore.ieee.org/'):
                print('{0} in IEEE'.format(title))
                num += 1
                continue

        # Fall back to searching IEEE by title.
        if IEEE.search(title) != '':
            print('{0} in IEEE'.format(title))
            num += 1

    return num
def downloadPDF(pdfURL, pdfPath, logPath='', warningPath=''):
    """Fetch a PDF and store it on disk.

    :param pdfURL: address the PDF is downloaded from
    :param pdfPath: destination path of the PDF
    :param logPath: log file path
    :param warningPath: warning file path
    :return: False when the URL is empty/None or the request fails;
        otherwise whatever ``tools.write`` returns for the write
    """
    # Nothing to download without an address.
    if pdfURL in ('', None):
        tools.warning('Failed to download the paper, for pdfURL is none',
                      warningPath)
        return False

    requestHeaders = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
    }
    reply = tools.requestsGet(pdfURL,
                              headers=requestHeaders,
                              logPath=logPath,
                              warningPath=warningPath)

    # An empty string signals that the request did not succeed.
    if reply == '':
        tools.warning(
            'Failed to download the {0} from the page {1}'.format(
                pdfPath, pdfURL), warningPath)
        return False

    tools.log('Successfully download {0}'.format(pdfPath), logPath)

    # The result of the write is the result of the whole operation.
    return tools.write(reply.content,
                       pdfPath,
                       mode='wb',
                       logPath=logPath,
                       warningPath=warningPath)
# Example no. 8
# 0
def createBaseInfoXML(conference,
                      papersfolderPath=defaultPath,
                      logPath='',
                      warningPath=''):
    """Build the base information of a venue and the paper lists of its years.

    For a ``conf/`` key, every XML document returned by ``getBaseInfoURL``
    is fetched; records whose year is at least ``projectInfo.firstYear``
    are appended to a ``Conference`` element, passed on to
    ``createPaperInfoXML``, and the element is written to ``baseInfo.xml``
    in the venue folder. For a ``journals/`` key, the volume links of the
    dblp page are passed to ``createPaperInfoXML`` directly.

    Nothing is done when the venue folder already exists. Records that lack
    a usable year or url, and volume links without a numeric year or href,
    are skipped with a warning instead of aborting the run after the folder
    has been created.

    :param conference: conference/journal key (``conf/...`` or ``journals/...``)
    :param papersfolderPath: root folder of the paper collection
    :param logPath: log file path (absolute path)
    :param warningPath: warning file path (absolute path)
    :return: None
    """
    baseUrl = 'https://dblp.org/'

    def firstText(node, path):
        """Return the text of the first xpath match, or None when absent."""
        matches = node.xpath(path)
        return matches[0].text if matches else None

    venueFolder = papersfolderPath + conference

    # An existing folder means this venue was handled before.
    if os.path.exists(venueFolder):
        return

    os.makedirs(venueFolder)

    if conference.startswith('conf/'):
        # Outermost element of the venue's XML document.
        root = etree.Element('Conference', name=conference)
        xmlsUrl = getBaseInfoURL(conference,
                                 logPath=logPath,
                                 warningPath=warningPath)
        for xmlUrl in xmlsUrl:
            response = tools.requestsGet(xmlUrl,
                                         headers=headers,
                                         logPath=logPath,
                                         warningPath=warningPath)

            # An empty string means the page could not be fetched.
            if response == '':
                warningInfo = 'Failed to get the dblps from the page {0}'.format(
                    xmlUrl)
                tools.warning(warningInfo, warningPath)
                continue

            xml = etree.fromstring(response.content)
            for dblp in xml.xpath('/dblp'):
                try:
                    year = int(firstText(dblp, '//year'))
                except (TypeError, ValueError):
                    warningInfo = 'Failed to read the year from the page {0}'.format(
                        xmlUrl)
                    tools.warning(warningInfo, warningPath)
                    continue

                if year < projectInfo.firstYear:
                    break

                url = firstText(dblp, '//url')
                if not url:
                    warningInfo = 'Failed to read the url from the page {0}'.format(
                        xmlUrl)
                    tools.warning(warningInfo, warningPath)
                    continue

                name = url.split('/')[-1].split('.')[0]
                url = baseUrl + url
                # A record without a publisher is still processed; the empty
                # string equals createPaperInfoXML's own default.
                publisher = firstText(dblp, '//publisher') or ''
                root.append(dblp)
                # Build the paper list of this year.
                createPaperInfoXML(url,
                                   name,
                                   conference,
                                   publisher,
                                   papersfolderPath=papersfolderPath,
                                   logPath=logPath,
                                   warningPath=warningPath)

        with open('{0}{1}/baseInfo.xml'.format(papersfolderPath, conference),
                  'wb') as f:
            f.write(etree.tostring(root))

    elif conference.startswith('journals/'):
        journalsBaseURL = 'https://dblp.org/db/'
        response = tools.requestsGet(journalsBaseURL + conference,
                                     headers=headers,
                                     logPath=logPath,
                                     warningPath=warningPath)
        # An empty string means the page could not be fetched.
        if response == '':
            return
        html = etree.HTML(response.text)
        for volume in html.xpath('//*[@id="main"]/ul/li/a'):
            name = volume.text
            try:
                # The year is read from the last word of the link text,
                # keeping the part before any '/'.
                year = int(name.split(' ')[-1].split('/')[0])
            except (AttributeError, ValueError):
                warningInfo = 'Failed to read the year from the volume {0!r} of {1}'.format(
                    name, conference)
                tools.warning(warningInfo, warningPath)
                continue

            if year < projectInfo.firstYear:
                continue

            url = volume.get('href')
            if not url:
                warningInfo = 'Failed to read the url from the volume {0!r} of {1}'.format(
                    name, conference)
                tools.warning(warningInfo, warningPath)
                continue

            createPaperInfoXML(url,
                               name,
                               conference,
                               papersfolderPath=papersfolderPath,
                               logPath=logPath,
                               warningPath=warningPath)
    else:
        return

    successInfo = "{0}'s baseInfo setup".format(conference)
    tools.log(successInfo, logPath)
    tools.log('\n', logPath, hasTime=False)
# Example no. 9
# 0
def createPaperInfoXML(paperInfoUrl,
                       name,
                       conference,
                       publisher='',
                       papersfolderPath=defaultPath,
                       logPath='',
                       warningPath=''):
    """Build ``paperInfo.xml`` for one year/volume of a venue.

    The page at ``paperInfoUrl`` is searched for a ``format=xml`` link. If
    none exists, links containing ``<name>-`` are treated as volumes and
    processed recursively. Otherwise the linked XML is fetched and its
    ``hits`` element is stored under a ``Year`` root; when that element has
    no children, each ``.xml`` link on the page (except the first) is
    fetched and its record is appended one by one. Every hit is marked with
    ``hasDownloadPDF='False'`` and ``hasSolved='False'``.

    Nothing is done when the year/volume folder already exists.

    :param paperInfoUrl: URL of the page listing the papers of this year
    :param name: folder name of this year/volume
    :param conference: venue this year/volume belongs to
    :param publisher: publisher stored on the ``Year`` element
    :param papersfolderPath: root folder of the paper collection
    :param logPath: log file path (absolute path)
    :param warningPath: warning file path (absolute path)
    :return: None
    """

    name = tools.toFilename(name, False)

    def warnConstructFailed():
        """Record that paperInfo.xml of this year/volume was not built."""
        warningInfo = 'Failed to construct the {0}{1}/{2}/paperInfo.xml'.format(
            papersfolderPath, conference, name)
        tools.warning(warningInfo, warningPath)
        tools.warning('\n', warningPath, hasTime=False)

    # An existing folder means this year/volume was handled before.
    if os.path.exists('{0}{1}/{2}'.format(papersfolderPath, conference, name)):
        return

    response = tools.requestsGet(paperInfoUrl,
                                 headers=headers,
                                 logPath=logPath,
                                 warningPath=warningPath)
    # An empty string means the page of this year could not be fetched.
    if response == '':
        warnConstructFailed()
        return

    # Look for the XML document that holds the base info of all papers.
    html = etree.HTML(response.text)
    paperInfoXMLURLs = html.xpath(
        '//a[contains(@href, "format=xml")]/attribute::href')

    # No such link: the year may be split into volumes, or the xpath
    # expression may not fit the page.
    if not paperInfoXMLURLs:
        # Kept separate from ``name`` so the warning below still reports the
        # folder name that was actually requested.
        volumePrefix = name + '-'
        volumeURLs = html.xpath(
            '//a[contains(@href, "{0}")]/attribute::href'.format(
                volumePrefix))

        if not volumeURLs:
            warningInfo = 'Failed to get the xml address from the page {0}, Please check your xpath expression'.format(
                paperInfoUrl)
            tools.warning(warningInfo, warningPath)
            warnConstructFailed()
            return

        # Build one document per volume, below the same root folder that
        # this call was given.
        for volumeURL in volumeURLs:
            volumeName = volumeURL.split('/')[-1].split('.')[0]
            createPaperInfoXML(volumeURL,
                               volumeName,
                               conference,
                               publisher,
                               papersfolderPath=papersfolderPath,
                               logPath=logPath,
                               warningPath=warningPath)
        return

    paperInfoXMLURL = paperInfoXMLURLs[0]
    response = tools.requestsGet(paperInfoXMLURL,
                                 headers=headers,
                                 logPath=logPath,
                                 warningPath=warningPath)

    # An empty string means the XML document could not be fetched.
    if response == '':
        warningInfo = 'Failed to get the xml document {0}'.format(
            paperInfoXMLURL)
        tools.warning(warningInfo, warningPath)
        warnConstructFailed()
        return

    paperInfoXML = etree.XML(response.content)
    hitsElements = paperInfoXML.xpath('//hits')
    # Without a hits element there is nothing to store.
    if not hitsElements:
        warningInfo = 'Failed to get the xml document {0}'.format(
            paperInfoXMLURL)
        tools.warning(warningInfo, warningPath)
        warnConstructFailed()
        return
    paperInfo = hitsElements[0]

    # Outermost element of the document.
    root = etree.Element('Year', name=name, publisher=publisher)

    # len() of an element is its number of children. Zero children means
    # the bulk document carried no papers, so they are fetched one by one
    # and joined under a fresh hits element.
    if len(paperInfo) == 0:
        # All .xml links of the page except the first one.
        paperInfoXMLURLs = html.xpath(
            '//a[contains(@href, ".xml")]/attribute::href')[1:]
        paperNum = len(paperInfoXMLURLs)
        hits = etree.Element('hits',
                             total=str(len(paperInfoXMLURLs)),
                             completed='0',
                             current='0')

        logInfo = 'Getting paper infos one by one, {0}/{1} has {2} papers'.format(
            conference, name, paperNum)
        tools.log(logInfo, logPath)

        for paperInfoXMLURL in paperInfoXMLURLs:
            response = tools.requestsGet(paperInfoXMLURL,
                                         headers=headers,
                                         logPath=logPath,
                                         warningPath=warningPath)
            # An empty string means this paper's document was not fetched.
            if response == '':
                warningInfo = "Failed to get the paper's info from the page {0}".format(
                    paperInfoXMLURL)
                tools.warning(warningInfo, warningPath)
                continue

            paperInfoXML = etree.XML(response.content)
            records = paperInfoXML.xpath('//dblp')
            infos = paperInfoXML.xpath('//inproceedings')
            # Skip documents that do not have the expected layout rather
            # than aborting the whole year.
            if not records or not infos:
                warningInfo = "Failed to get the paper's info from the page {0}".format(
                    paperInfoXMLURL)
                tools.warning(warningInfo, warningPath)
                continue

            # Rename the elements so they match the bulk layout (hit/info).
            hit = records[0]
            hit.tag = 'hit'
            hit.set('hasDownloadPDF', 'False')
            hit.set('hasSolved', 'False')
            infos[0].tag = 'info'
            hits.append(hit)

        root.append(hits)

    # The bulk document already contains all papers.
    else:
        paperInfo.set('completed', '0')
        paperInfo.set('current', '0')
        paperNum = 0
        for hit in paperInfo.xpath('.//hit'):
            hit.set('hasDownloadPDF', 'False')
            hit.set('hasSolved', 'False')
            # Only hits that have a pages element are counted as papers.
            if hit.find('.//pages') is not None:
                paperNum += 1
        paperInfo.set('total', str(paperNum))
        root.append(paperInfo)

    # Create the folder of this year/volume and store the document in it.
    os.makedirs('{0}{1}/{2}'.format(papersfolderPath, conference, name))
    with open(
            '{0}{1}/{2}/paperInfo.xml'.format(papersfolderPath, conference,
                                              name), 'wb') as f:
        f.write(etree.tostring(root))
    successInfo = "{0}/{1}'s baseInfo set up, it has {2} papers totally".format(
        conference, name, paperNum)
    tools.log(successInfo, logPath)
    tools.log('\n', logPath, hasTime=False)