def findPdfFromInfo(self, infoPageUrl):
    """Fetch a Google Scholar info page and try to extract a PDF source.

    Returns a PdfObj pointing at a direct PDF URL, a PdfObj downloaded via
    the Get It!@Waterloo proxy, or None when no usable source is found.
    """
    response = self.session.get(infoPageUrl, headers=self.headers)
    soup = BeautifulSoup(response.content, 'lxml')
    extract = soup.find('div', attrs={'id': 'gsc_title_gg'})
    if extract is None:
        return None
    # find pdf url
    tag = extract.find('span', attrs={'class': 'gsc_title_ggt'})
    # BUGFIX: the original indexed extract.find('a')['href'] without checking
    # for a missing anchor, raising TypeError (and passed None to badSource).
    anchor = extract.find('a')
    if (tag is not None and tag.text == "[PDF]"
            and anchor is not None and not self.badSource(anchor)):
        return PdfObj('url', anchor['href'])
    elif tag is not None:
        print('Non-PDF tag or bad source, using get it @ waterloo')
        potential_links = extract.findAll('div', attrs={'class': 'gsc_title_ggi'})
        for div in potential_links:
            if div.text.strip() == 'Get It!@Waterloo':
                wat_anchor = div.find('a')
                if wat_anchor is None:
                    # no href to follow; try the next candidate div
                    continue
                pdf_obj = self.getWatPDF(wat_anchor['href'])
                if pdf_obj is not None:
                    return pdf_obj
    return None
def getWatPDF(self, url, title=None):
    """Download the paper behind a WatLib URL and wrap it as a local PdfObj.

    Returns None when the selenium-based download reports failure.
    """
    print(url)
    # throttle so we do not hammer the library proxy
    time.sleep(15)
    result = WatLibSeleniumParser.downloadFromWatLib(url, 'paper.pdf')
    if result is None:
        return None
    return PdfObj('local', 'paper.pdf')
def findPapersFromCitations(self, url, toload):
    """Scrape a Scopus citations page and return up to *toload* PdfObj entries.

    Each entry carries the paper title; entries whose PDF could not be
    downloaded are title-only placeholders (PdfObj('local')).
    """
    response = SESSION.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    papers_ul = soup.find('ul', attrs={'id': 'documentListUl'})
    paper_divs = papers_ul.findAll('li')
    papers_list = []
    count = 0
    for pdiv in paper_divs:
        title = pdiv.find('span', attrs={'class': 'docTitle'}).text.replace('\n', '')
        # BUGFIX: the original while-loop re-fetched the SAME first anchor on
        # every iteration, so it never terminated when that anchor lacked the
        # GetIt!@Waterloo image, and raised AttributeError when no anchor
        # existed at all. Scan every candidate anchor instead.
        link = None
        for candidate in pdiv.findAll('a', attrs={'class': 'outwardLink'}, href=True):
            if candidate.find(
                    'img',
                    attrs={'title': 'GetIt!@Waterloo(opens in a new window)'}) is not None:
                link = candidate
                break
        new_pdf = None
        if link is not None:
            new_pdf = self.getWatPDF(link['href'])
        if new_pdf is None:
            # no downloadable content: keep a title-only placeholder
            new_pdf = PdfObj('local')
        new_pdf.setTitle(title)
        papers_list.append(new_pdf)
        count += 1
        # only load num specified
        if count >= toload:
            break
    return papers_list
def findPapersFromCitations(self, url, toload):
    """Scrape a Scopus citations page and return up to *toload* PdfObj entries.

    Each entry carries the paper title; entries whose PDF could not be
    downloaded are title-only placeholders (PdfObj('local')).
    """
    response = SESSION.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    papers_ul = soup.find('ul', attrs={'id': 'documentListUl'})
    paper_divs = papers_ul.findAll('li')
    papers_list = []
    count = 0
    for pdiv in paper_divs:
        title = pdiv.find('span', attrs={'class': 'docTitle'}).text.replace('\n', '')
        # BUGFIX: the original while-loop re-fetched the SAME first anchor on
        # every iteration, so it never terminated when that anchor lacked the
        # GetIt!@Waterloo image, and raised AttributeError when no anchor
        # existed at all. Scan every candidate anchor instead.
        link = None
        for candidate in pdiv.findAll('a', attrs={'class': 'outwardLink'}, href=True):
            if candidate.find(
                    'img',
                    attrs={'title': 'GetIt!@Waterloo(opens in a new window)'}) is not None:
                link = candidate
                break
        new_pdf = None
        if link is not None:
            new_pdf = self.getWatPDF(link['href'])
        if new_pdf is None:
            # no downloadable content: keep a title-only placeholder
            new_pdf = PdfObj('local')
        new_pdf.setTitle(title)
        papers_list.append(new_pdf)
        count += 1
        # only load num specified
        if count >= toload:
            break
    return papers_list
def getWatPDF(self, url, title=None, pdfName='paper.pdf'):
    """Download a paper from WatLib into *pdfName* and return it as a PdfObj.

    Returns None when the download fails; a manual KeyboardInterrupt while
    wrapping the result resets the selenium parser instead.
    """
    print('Getting pdf from WatLib')
    print(url)
    # BUGFIX: honor the pdfName argument — the download target was hard-coded
    # to 'paper.pdf', so the returned PdfObj could point at a file that was
    # never written when a caller passed a different pdfName.
    status = WATPARSER.downloadFromWatLib(url, pdfName)
    print('finish here')
    if status is None:
        print('None status')
        return None
    try:
        return PdfObj('local', pdfName)
    except KeyboardInterrupt:
        # allow the operator to abort and reset the selenium session
        return WATPARSER.reset()
def findPapersFromCitations(self, citationsUrl):
    """Scrape a Google Scholar citations page into a list of PdfObj.

    Direct [PDF] links become url-backed PdfObjs; otherwise the
    Get It!@Waterloo proxy is tried; failing that, a title-only
    placeholder is returned for the entry.
    """
    response = self.session.get(citationsUrl, headers=self.headers)
    soup = BeautifulSoup(response.content, 'lxml')
    linkExtracts = soup.findAll('div', attrs={'class': 'gs_r'})
    pdfList = []
    if linkExtracts is None:
        return pdfList
    for extract in linkExtracts:
        heading = extract.find('h3', attrs={'class': 'gs_rt'})
        if heading is None:
            # robustness: skip result rows without a title heading
            # (the original raised AttributeError here)
            continue
        # strip leading tags such as "[PDF]"/"[HTML]" from the title;
        # raw string fixes the invalid '\[' escape-sequence warning
        title = re.sub(r'(\[.*\])', '', heading.text)
        extract = extract.find('div', attrs={'class': 'gs_ggsm'})
        pdf_obj = PdfObj('local')
        pdf_obj.setTitle(title)
        print(pdf_obj.getTitle())
        if extract is None:
            print('Found PDF title but no PDF link. Returning only title: '
                  + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)
            continue
        # this code will skip links with [HTML] tag and fall through to the
        # "Get it at UWaterloo" path for non-PDF tags
        tag = extract.find('span', attrs={'class': 'gs_ctg2'})
        anchor = extract.find('a')
        # BUGFIX: guard against a missing anchor before indexing ['href']
        if (tag is not None and tag.text == "[PDF]"
                and anchor is not None and not self.badSource(anchor)):
            pdf_obj.resetContent('url', anchor['href'])
            print('pdf url: ' + pdf_obj.getPathUrl() + ' has title '
                  + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)
            continue
        elif tag is not None:
            print('Non-PDF tag, using get it @ waterloo')
            potential_links = extract.findAll('a')
            notFound = True
            for link in potential_links:
                if link.text.strip() == "Get It!@Waterloo":
                    print('Get It!@Waterloo')
                    url = SessionInitializer.ROOT_URL + link['href']
                    pdf_obj = self.getWatPDF(url)
                    if pdf_obj is not None:
                        pdf_obj.setTitle(title)
                        notFound = False
                    else:
                        pdf_obj = PdfObj('local')
                        pdf_obj.setTitle(title)
                    break
            if notFound:
                print('Found PDF title but no PDF content. Returning only title.'
                      + str(pdf_obj.getTitle()))
        pdfList.append(pdf_obj)
    pdfList = [p for p in pdfList if p is not None]
    return pdfList
def findPapersFromCitations(self, citationsUrl):
    """Scrape a Google Scholar citations page into a list of PdfObj.

    Direct [PDF] links become url-backed PdfObjs; otherwise the
    Get It!@Waterloo proxy is tried; failing that, a title-only
    placeholder is returned for the entry.
    """
    response = self.session.get(citationsUrl, headers=self.headers)
    soup = BeautifulSoup(response.content, 'lxml')
    linkExtracts = soup.findAll('div', attrs={'class': 'gs_r'})
    pdfList = []
    if linkExtracts is None:
        return pdfList
    for extract in linkExtracts:
        heading = extract.find('h3', attrs={'class': 'gs_rt'})
        if heading is None:
            # robustness: skip result rows without a title heading
            # (the original raised AttributeError here)
            continue
        # strip leading tags such as "[PDF]"/"[HTML]" from the title;
        # raw string fixes the invalid '\[' escape-sequence warning
        title = re.sub(r'(\[.*\])', '', heading.text)
        extract = extract.find('div', attrs={'class': 'gs_ggsm'})
        pdf_obj = PdfObj('local')
        pdf_obj.setTitle(title)
        print(pdf_obj.getTitle())
        if extract is None:
            print('Found PDF title but no PDF link. Returning only title: '
                  + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)
            continue
        # this code will skip links with [HTML] tag and fall through to the
        # "Get it at UWaterloo" path for non-PDF tags
        tag = extract.find('span', attrs={'class': 'gs_ctg2'})
        anchor = extract.find('a')
        # BUGFIX: guard against a missing anchor before indexing ['href']
        if (tag is not None and tag.text == "[PDF]"
                and anchor is not None and not self.badSource(anchor)):
            pdf_obj.resetContent('url', anchor['href'])
            print('pdf url: ' + pdf_obj.getPathUrl() + ' has title '
                  + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)
            continue
        elif tag is not None:
            print('Non-PDF tag, using get it @ waterloo')
            potential_links = extract.findAll('a')
            notFound = True
            for link in potential_links:
                if link.text.strip() == "Get It!@Waterloo":
                    print('Get It!@Waterloo')
                    url = SessionInitializer.ROOT_URL + link['href']
                    pdf_obj = self.getWatPDF(url)
                    if pdf_obj is not None:
                        pdf_obj.setTitle(title)
                        notFound = False
                    else:
                        pdf_obj = PdfObj('local')
                        pdf_obj.setTitle(title)
                    break
            if notFound:
                print('Found PDF title but no PDF content. Returning only title.'
                      + str(pdf_obj.getTitle()))
        pdfList.append(pdf_obj)
    pdfList = [p for p in pdfList if p is not None]
    return pdfList