Example #1
    def findPdfFromInfo(self, infoPageUrl):
        response = self.session.get(infoPageUrl, headers=self.headers)
        soup = BeautifulSoup(response.content, 'lxml')

        extract = soup.find('div', attrs={'id': 'gsc_title_gg'})
        if extract is None:
            return None

        # find a direct PDF link; non-[PDF] tags fall back to Get It!@Waterloo
        tag = extract.find('span', attrs={'class': 'gsc_title_ggt'})
        if tag is not None and tag.text == "[PDF]" and not self.badSource(
                extract.find('a')):
            return PdfObj('url', extract.find('a')['href'])
        elif tag is not None:
            print('Non-PDF tag or bad source, using get it @ waterloo')

        potential_links = extract.findAll('div',
                                          attrs={'class': 'gsc_title_ggi'})
        for div in potential_links:
            text = div.text.strip()
            if text == 'Get It!@Waterloo':
                pdf_obj = self.getWatPDF(div.find('a')['href'])
                if pdf_obj is not None:
                    return pdf_obj
        return None
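These examples build and return PdfObj instances without ever showing the class. A minimal sketch consistent with the call sites above (a content kind plus an optional path or URL, a mutable title, and a way to swap content in later) might look like this; it is an assumption reconstructed from usage, not the project's actual class.

# Hypothetical PdfObj sketch, inferred from the call sites in these examples;
# the real class is not shown here and may differ.
class PdfObj:
    def __init__(self, kind, pathUrl=None):
        self.kind = kind        # 'url' for a remote PDF, 'local' for a file on disk
        self.pathUrl = pathUrl  # href or local filename; None means title-only
        self.title = None

    def setTitle(self, title):
        self.title = title

    def getTitle(self):
        return self.title

    def getPathUrl(self):
        return self.pathUrl

    def resetContent(self, kind, pathUrl):
        # repoint at new content, e.g. once a direct PDF URL is found
        self.kind = kind
        self.pathUrl = pathUrl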
Example #2
    def getWatPDF(self, url, title=None):
        print(url)
        # pause before starting the Selenium-driven download
        time.sleep(15)
        status = WatLibSeleniumParser.downloadFromWatLib(url, 'paper.pdf')
        if status is None:
            return None
        return PdfObj('local', 'paper.pdf')
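getWatPDF treats the WatLib parser as a blocking downloader: save the PDF under the given name and return a status, or None on failure. A hypothetical stand-in capturing that assumed contract (handy for exercising the callers without a browser):

# Fake stand-in for the Selenium-backed WatLib parser; only the contract the
# callers rely on is reproduced here, and that contract is itself an assumption.
class FakeWatLibParser:
    @staticmethod
    def downloadFromWatLib(url, pdfName):
        # pretend the download succeeded by writing a placeholder file
        with open(pdfName, 'wb') as f:
            f.write(b'%PDF-1.4\n')
        return 'ok'

    @staticmethod
    def reset():
        # the real parser presumably restarts its browser session here
        return None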
Example #3
    def findPapersFromCitations(self, url, toload):
        response = SESSION.get(url)
        soup = BeautifulSoup(response.content, 'lxml')

        papers_ul = soup.find('ul', attrs={'id': 'documentListUl'})
        paper_divs = papers_ul.findAll('li')

        papers_list = []

        count = 0
        for pdiv in paper_divs:
            title = pdiv.find('span', attrs={
                'class': 'docTitle'
            }).text.replace('\n', '')
            # pick the outward link that carries the Get It!@Waterloo image;
            # a result can list several outward links, so scan them all
            link = None
            for candidate in pdiv.findAll('a',
                                          attrs={'class': 'outwardLink'},
                                          href=True):
                if candidate.find('img',
                                  attrs={
                                      'title':
                                      'GetIt!@Waterloo(opens in a new window)'
                                  }) is not None:
                    link = candidate
                    break

            new_pdf = None
            if link is not None:
                link = link['href']
                new_pdf = self.getWatPDF(link)

            if new_pdf is None:
                new_pdf = PdfObj('local')

            new_pdf.setTitle(title)
            papers_list.append(new_pdf)

            count += 1
            # only load num specified
            if count >= toload:
                break

        return papers_list
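The Waterloo-link selection is easy to check in isolation. A small runnable sketch of the same scan over an inline fragment (the class names and img title mirror the Scopus markup this example assumes):

from bs4 import BeautifulSoup

WATERLOO_IMG_TITLE = 'GetIt!@Waterloo(opens in a new window)'

def find_waterloo_link(pdiv):
    # return the first outwardLink <a> carrying the Get It!@Waterloo image
    for candidate in pdiv.find_all('a', attrs={'class': 'outwardLink'}, href=True):
        if candidate.find('img', attrs={'title': WATERLOO_IMG_TITLE}) is not None:
            return candidate
    return None

html = '''
<ul id="documentListUl">
  <li>
    <a class="outwardLink" href="/elsewhere">publisher site</a>
    <a class="outwardLink" href="/watlib"><img title="GetIt!@Waterloo(opens in a new window)"/></a>
  </li>
</ul>
'''
li = BeautifulSoup(html, 'lxml').find('li')
link = find_waterloo_link(li)
print(link['href'] if link is not None else 'no Waterloo link')  # prints /watlib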
Example #4
# sc = ScopusPdfExtractor()
# sc.getWatPDF('https://www-scopus-com.proxy.lib.uwaterloo.ca/redirect/linking.uri?targetURL=http%3a%2f%2fsfx.scholarsportal.info%2fwaterloo%3fsid%3dElsevier%3aScopus%26_service_type%3dgetFullTxt%26issn%3d16871472%26isbn%3d%26volume%3d2016%26issue%3d1%26spage%3d%26epage%3d%26pages%3d%26artnum%3d181%26date%3d2016%26id%3ddoi%3a10.1186%252fs13638-016-0680-7%26title%3dEurasip%2bJournal%2bon%2bWireless%2bCommunications%2band%2bNetworking%26atitle%3dUnderstanding%2bSTDMA%2bvia%2bcomputer%2bsimulation%253a%2bfeasibility%2bto%2bvehicular%2bsafety%2bapplications%252c%2bconfigurations%252c%2band%2btime%2bsynchronization%2berrors%26aufirst%3dJ.-H.%26auinit%3dJ.-H.%26auinit1%3dJ%26aulast%3dLim&locationID=2&categoryID=6&eid=2-s2.0-84981156913&issn=16871472&linkType=TemplateLinking&year=2016&zone=outwardlinks&origin=resultslist&dig=f27235173a5b4809def53afe4c6884f2&recordRank=4\
# ', pdfName='paper.pdf')
    def getWatPDF(self, url, title=None, pdfName='paper.pdf'):
        print('Getting pdf from WatLib')
        print(url)
        # download under pdfName so the returned PdfObj points at the right file
        status = WATPARSER.downloadFromWatLib(url, pdfName)
        print('finish here')
        if status is None:
            print('None status')
            return None
        try:
            newPdf = PdfObj('local', pdfName)
            return newPdf
        except KeyboardInterrupt:
            # a manual interrupt resets the Selenium session instead of crashing
            return WATPARSER.reset()
Example #5
    def findPapersFromCitations(self, citationsUrl):
        response = self.session.get(citationsUrl, headers=self.headers)
        soup = BeautifulSoup(response.content, 'lxml')

        linkExtracts = soup.findAll('div', attrs={'class': 'gs_r'})
        pdfList = []

        # findAll returns an empty list (never None) when nothing matches
        if not linkExtracts:
            return pdfList

        for extract in linkExtracts:
            # strip bracketed tags such as [PDF] or [HTML] from the title
            title = extract.find('h3', attrs={'class': 'gs_rt'}).text
            title = re.sub(r'\[.*\]', '', title)
            extract = extract.find('div', attrs={'class': 'gs_ggsm'})
            pdf_obj = PdfObj('local')
            pdf_obj.setTitle(title)
            print(pdf_obj.getTitle())

            if extract is None:
                print('Found PDF title but no PDF link. Returning only title: ' + str(pdf_obj.getTitle()))
                pdfList.append(pdf_obj)
                continue

            # take direct [PDF] links from good sources; [HTML]-tagged results fall back to Get It!@Waterloo
            tag = extract.find('span', attrs={'class': 'gs_ctg2'})
            if tag is not None and tag.text == "[PDF]" and not self.badSource(extract.find('a')):
                pdf_obj.resetContent('url', extract.find('a')['href'])
                print('pdf url: ' + pdf_obj.getPathUrl() + ' has title ' + str(pdf_obj.getTitle()))
                pdfList.append(pdf_obj)
                continue
            elif tag is not None:
                print('Non-PDF tag, using get it @ waterloo')

            potential_links = extract.findAll('a')

            notFound = True
            for link in potential_links:
                if link.text.strip() == "Get It!@Waterloo":
                    print('Get It!@Waterloo')
                    url = SessionInitializer.ROOT_URL + link['href']
                    pdf_obj = self.getWatPDF(url)
                    if pdf_obj is not None:
                        pdf_obj.setTitle(title)
                        notFound = False
                    else:
                        pdf_obj = PdfObj('local')
                        pdf_obj.setTitle(title)
                    break

            if notFound:
                print('Found PDF title but no PDF content. Returning only title: ' + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)

        pdfList = [p for p in pdfList if p is not None]
        return pdfList
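The title cleanup above hinges on stripping the bracketed tag Google Scholar prepends to result titles. A tiny runnable check of that regex:

import re

# mirrors the re.sub call in findPapersFromCitations: drop bracketed tags
# such as [PDF] or [HTML] from the start of a Scholar result title
for raw in ('[PDF] Understanding STDMA via computer simulation',
            '[HTML][HTML] Vehicular safety applications',
            'A title with no tag'):
    print(re.sub(r'\[.*\]', '', raw).strip())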