def getWatPDF(self, url, title=None):
    """Download the paper at *url* through WatLib and wrap it in a PdfObj.

    The URL is echoed to stdout, the call then pauses for 15 seconds
    (presumably to throttle requests to the library site -- TODO confirm),
    and the download is written under the fixed name 'paper.pdf'.

    Args:
        url: Address handed to WatLibSeleniumParser.downloadFromWatLib.
        title: Accepted but not used by this method.

    Returns:
        A PdfObj built from the local file 'paper.pdf', or None when the
        download call returns None.
    """
    print(url)
    time.sleep(15)
    download_result = WatLibSeleniumParser.downloadFromWatLib(url, 'paper.pdf')
    if download_result is not None:
        return PdfObj('local', 'paper.pdf')
    return None
def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Count how often each paper citing *paper* cites *author*.

    For every PDF object returned by ``paper.getCitingPdfs`` the reference
    content is extracted and the number of citations to the author's
    title-cased last name is looked up. One dict is recorded per citing
    paper with the keys 'Citing Paper Number' (1-based position), 'Title'
    and 'Over-cite Count'.

    Args:
        paper: Object providing ``getCitingPdfs(n)``.
        author: Object providing ``getLastName()``.
        cite_num_to_load: Value forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts as described above. Citing papers with neither
        reference content nor a title are left out; papers with a title but
        no content get 'Over-cite Count' set to "No PDF Found". If the user
        interrupts with Ctrl-C, the entries gathered so far are returned.
    """
    results = []
    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)
        extractor = PaperReferenceExtractor()
        for position, citing_pdf in enumerate(citing_pdfs, start=1):
            references = extractor.getReferencesContent(citing_pdf)
            title = citing_pdf.getTitle()
            label = "Citing paper number " + str(position) + ": "

            if references is None:
                # Nothing to analyse; only record the paper if it can at
                # least be identified by its title.
                if title is None:
                    continue
                print(label + title + " had no PDF content found.")
                results.append({
                    'Citing Paper Number': position,
                    'Title': title,
                    'Over-cite Count': "No PDF Found",
                })
                continue

            last_name = author.getLastName().title()
            cite_count = extractor.getCitesToAuthor(last_name, references)
            if title is None:
                title = 'Unknown Title'
            print(label + title + " cites " + last_name + " "
                  + str(cite_count) + " times.")
            results.append({
                'Citing Paper Number': position,
                'Title': title,
                'Over-cite Count': cite_count,
            })
    except KeyboardInterrupt:
        # Ctrl-C is treated as "stop early": reset the WatLib parser and
        # fall through to hand back whatever was collected so far.
        print('User ended program. Returning existing Data')
        WatLibSeleniumParser.reset()
    return results
def getWatPDF(self, url, title=None, pdfName='paper.pdf'):
    """Download the paper at *url* through WatLib and wrap it in a PdfObj.

    Args:
        url: Address handed to WatLibSeleniumParser.downloadFromWatLib.
        title: Accepted but not used by this method.
        pdfName: Local file name used both for the download and for the
            PdfObj that is returned. Defaults to 'paper.pdf'.

    Returns:
        A PdfObj built from the local file *pdfName*, or None when the
        download call returns None.
    """
    print('Getting pdf from WatLib')
    print(url)
    # Download under the same name PdfObj opens below. Previously the
    # download always used 'paper.pdf', so a custom pdfName made PdfObj
    # point at a file that was never written.
    status = WatLibSeleniumParser.downloadFromWatLib(url, pdfName)
    if status is None:
        print('None status')
        return None
    return PdfObj('local', pdfName)
''' Created on Jan 05, 2016 @author: Ankai ''' from bs4 import BeautifulSoup import time from ReferenceParser import IeeeReferenceParser, SpringerReferenceParser, PaperReferenceExtractor, PdfObj import SessionInitializer import WatLibSeleniumParser SESSION = SessionInitializer.getSesh() WATPARSER = WatLibSeleniumParser.WatLibParser() class Paper: def __init__(self, link, loadPaperPDFs=True): self.url = link self.pdfObj = None self.pap_info = {} #self.__pap_info['Publisher'] = '' self.citedByUrl = None self.citedByNum = 0 #Internet Session Setup self.loadFromScopus(loadPaperPDFs=loadPaperPDFs) def loadFromScopus(self, loadPaperPDFs=True): response = SESSION.get(self.url) soup = BeautifulSoup(response.content, 'lxml')