def overcite_an(path):
    """Tally author/journal/publisher frequencies for heavily-cited papers.

    Reads a citation CSV at *path* (row 0 is a header; column 1 holds the
    paper title, column 2 the citation count), keeps papers with >= 25
    citations, de-duplicates them, then looks each title up on Google
    Scholar and counts the authors, journals and publishers it finds.

    Prints progress and the final frequency dictionary; returns nothing.
    """
    titles = []
    with open(path, "r", encoding='latin1') as file:
        reader = csv.reader(file)
        for idx, line in enumerate(reader):
            if idx > 0:
                t = [line[1].strip(), line[2]]
                # Skip repeated header rows, the 'Total' row, and rows
                # whose citation column is not a plain integer.
                if t[0] != 'Paper Title' and t[0] != 'Total' and t[1].isdigit():
                    titles.append([t[0], int(t[1])])
    # Only papers cited at least 25 times are of interest.
    titles = [t for t in titles if t[1] >= 25]
    print(len(titles))
    titles = dedupe(titles)
    print(len(titles))
    # Total citations across the de-duplicated papers.
    s = sum(t[1] for t in titles)
    print(s)
    session = SessionInitializer.getSesh()
    headers = SessionInitializer.getHeaders()
    final_dict = {'authors': {}, 'journals': {}, 'publishers': {}}
    for t in titles:
        paper_name = t[0]
        query = "+".join(paper_name.split())
        url = SessionInitializer.ROOT_URL + '/scholar?q=' + query + '&btnG=&hl=en&as_sdt=0%2C5'
        print(url)
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            # The first 'gs_a' div reads "authors - journal, year - publisher".
            # find() returns None when there is no result block, making
            # .text raise AttributeError — the only failure we expect here.
            info_list = soup.find('div', attrs={'class': 'gs_a'}).text
        except AttributeError:
            print('cant find for ' + paper_name)
            continue
        info_list = [i.strip() for i in info_list.split(' - ')]
        authors = info_list[0].split(',')
        journal = info_list[1].split(',')[0]
        publisher = info_list[2]
        for a in authors:
            add_freq_dict(final_dict['authors'], a)
        add_freq_dict(final_dict['journals'], journal)
        add_freq_dict(final_dict['publishers'], publisher)
        print(publisher)
    print(final_dict)
def __init__(self, link, loadPdf=True):
    """Create a paper record for *link* and populate it from Google Scholar.

    link: URL of the paper's Google Scholar entry.
    loadPdf: forwarded to loadFromGoogleScholar; when True the PDF is
        fetched as well.
    """
    # Private paper state; loadFromGoogleScholar() fills these in.
    self.__url = link
    self.__pdfObj = None
    self.__pap_info = {'Publisher': ''}
    self.__citedByUrl = None
    self.__citedByNum = 0
    self.__allAuthors = None
    # Shared HTTP session and headers for scraping.
    self.session = SessionInitializer.getSesh()
    self.headers = SessionInitializer.getHeaders()
    self.loadFromGoogleScholar(loadPdf=loadPdf)
def __init__(self, mainUrl, numPapers, loadPaperPDFs=True, pubFilter=False):
    """Set up an author profile and, when a URL is given, load their papers.

    mainUrl: the author's profile URL, or None to create an empty profile.
    numPapers: how many papers loadPapers should fetch.
    loadPaperPDFs, pubFilter: forwarded unchanged to loadPapers.
    """
    self.first_name = None
    self.last_name = None
    self.url = None
    self.__paper_list = []
    # Shared HTTP session/headers for scraping.
    self.session = SessionInitializer.getSesh()
    self.headers = SessionInitializer.getHeaders()
    if mainUrl is not None:
        self.url = mainUrl
        self.loadPapers(numPapers, loadPaperPDFs=loadPaperPDFs,
                        pubFilter=pubFilter)
def __init__(self, link, loadPdf=True):
    """Initialize paper metadata and scrape it from Google Scholar.

    link: Google Scholar URL of the paper.
    loadPdf: passed through to loadFromGoogleScholar.
    """
    self.__url = link
    # Fields populated later by loadFromGoogleScholar().
    self.__pdfObj = None
    self.__pap_info = {}
    self.__pap_info['Publisher'] = ''
    self.__citedByUrl = None
    self.__citedByNum = 0
    self.__allAuthors = None
    # Internet session setup.
    sesh = SessionInitializer.getSesh()
    hdrs = SessionInitializer.getHeaders()
    self.session = sesh
    self.headers = hdrs
    self.loadFromGoogleScholar(loadPdf=loadPdf)
def downloadSpringerOpen(path):
    """Locate the SpringerOpen PDF link on the current browser page and
    stream it to *path*.

    Tries the 'SideBox_action' layout first, then the 'u-marginBtmM'
    layout. Returns 1 on a successful download, None otherwise.
    """
    try:
        anchor = ch.find_element_by_xpath("//p[@class='SideBox_action']/a[text()='Download PDF']")
        link = anchor.get_attribute('href')
        # Rotate the ';jwcn...' segment to the front of the URL.
        cut = link.find(';jwcn')
        link = link[cut + 1:] + link[:cut + 1]
        # Drop any trailing '?site=' query portion.
        qpos = link.find('?site=')
        if qpos != -1:
            link = link[:qpos]
    except selenium.common.exceptions.NoSuchElementException:
        print('Springer open link has no PDF, return trying v2...')
        try:
            anchor = ch.find_element_by_xpath("//p[@class='u-marginBtmM']/a[text()='Download PDF']")
            link = anchor.get_attribute('href')
        except selenium.common.exceptions.NoSuchElementException:
            print('Springer open link has no PDF, trying V2...')
            return None
    print(link)
    downloader = SessionInitializer.getSesh()
    resp = downloader.get(link, stream=True)
    if resp.status_code == 200:
        with open(path, 'wb') as out:
            # Let requests decompress the raw stream before copying.
            resp.raw.decode_content = True
            shutil.copyfileobj(resp.raw, out)
        return 1
    print('ERROR: watlib pdf springeropen was not downloaded correctly')
    return None
def downloadIEEE(self, href, path):
    """Resolve the real IEEE PDF frame URL from the viewer page and hand it
    to downloadPdfLink.

    href: clickable element leading to the IEEE viewer page.
    path: destination file for the PDF.
    Returns downloadPdfLink's result, or None when no frame URL is found.
    """
    href.click()
    http = SessionInitializer.getSesh()
    page = BeautifulSoup(http.get(self.ch.current_url).content, 'lxml').text
    # Extract the value following '"pdfUrl"' in the embedded page text.
    start = page.find('"pdfUrl"')
    page = page[start:]
    colon = page.find(':')
    comma = page.find(',')
    rel = page[colon + 1:comma].strip().strip('"')
    url = 'http://ieeexplore.ieee.org.proxy.lib.uwaterloo.ca' + rel
    print(url)
    wrapper = BeautifulSoup(http.get(url).content, 'lxml')
    srcFrame = None
    # Keep the last frame whose src is an absolute http URL.
    for frame in wrapper.findAll('frame'):
        if frame['src'] and 'http' in frame['src']:
            srcFrame = frame['src']
    if srcFrame:
        return self.downloadPdfLink(srcFrame, path, 'IEEE')
    return None
def count_journal_frequency(author, num_papers):
    """For each of the author's papers (from the 46th on), page through its
    Google Scholar 'cited by' results and tally citing-journal frequencies.

    Returns a list of [paper title, {journal: count}] pairs; a
    KeyboardInterrupt stops early and returns what was gathered so far.
    """
    author.loadPapers(num_papers, loadPaperPDFs=False)
    print("Author fully loaded. Processing loaded papers...")
    pap_arr = []
    count = 0
    try:
        for idx, paper in enumerate(author.getPapers()[45:]):
            time.sleep(10)  # throttle to avoid Scholar rate limiting
            info_list = []
            one_pap_arr = []
            cited_by_url = paper.getCitedByUrl()
            session = SessionInitializer.getSesh()
            prefix = SessionInitializer.ROOT_URL + '/scholar?start='
            suffix = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
            # Strip the trailing query arg, keep only the cites= code.
            cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
            paper_code = cited_by_url[cited_by_url.rfind('=') + 1:]
            # First three result pages (offsets 0, 10, 20).
            for offset in (0, 10, 20):
                time.sleep(10)
                final_url = prefix + str(offset) + suffix + paper_code
                print(final_url)
                response = session.get(final_url)
                soup = BeautifulSoup(response.content, "lxml")
                info_list += soup.findAll('div', attrs={'class': 'gs_a'})
            journal_dict = {}
            for tag in info_list:
                name = tag.text.split(' - ')[1].split(',')[0].replace('…', '').strip()
                if name.isdigit():
                    continue  # a bare year, not a journal name
                name = name.lower().title()
                journal_dict[name] = journal_dict.get(name, 0) + 1
            one_pap_arr.append(paper.getInfo()['Title'])
            one_pap_arr.append(journal_dict)
            print(one_pap_arr)
            pap_arr.append(one_pap_arr)
            print('Paper ' + str(idx) + ' complete.')
            count += 1
    except KeyboardInterrupt:
        print('User ended program, returning journal array')
        print(pap_arr)
    return pap_arr
def count_journal_frequency(author, num_papers):
    """Tally which journals cite each of the author's papers.

    Loads num_papers papers, then for every paper past index 44 scrapes
    three pages of its 'cited by' listing and counts journal names.
    Returns a list of [title, frequency-dict] entries (partial results on
    KeyboardInterrupt).
    """
    author.loadPapers(num_papers, loadPaperPDFs=False)
    print("Author fully loaded. Processing loaded papers...")
    results = []
    processed = 0
    try:
        for position, paper in enumerate(author.getPapers()[45:]):
            time.sleep(10)  # be polite to Google Scholar
            divs = []
            entry = []
            cites_link = paper.getCitedByUrl()
            sesh = SessionInitializer.getSesh()
            base = SessionInitializer.ROOT_URL + '/scholar?start='
            tail = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
            # Isolate the numeric cites= code from the cited-by URL.
            cites_link = cites_link[:cites_link.rfind('&')]
            code = cites_link[cites_link.rfind('=') + 1:]
            for start in range(0, 30, 10):
                time.sleep(10)
                page_url = base + str(start) + tail + code
                print(page_url)
                page = sesh.get(page_url)
                parsed = BeautifulSoup(page.content, "lxml")
                divs += parsed.findAll('div', attrs={'class': 'gs_a'})
            freq = {}
            for div in divs:
                text = div.text
                # 'authors - journal, year - publisher' -> journal name.
                text = text.split(' - ')[1].split(',')[0].replace('…', '').strip()
                if text.isdigit():
                    continue
                text = text.lower().title()
                if text in freq:
                    freq[text] += 1
                else:
                    freq[text] = 1
            entry.append(paper.getInfo()['Title'])
            entry.append(freq)
            print(entry)
            results.append(entry)
            print('Paper ' + str(position) + ' complete.')
            processed += 1
    except KeyboardInterrupt:
        print('User ended program, returning journal array')
        print(results)
    return results
def downloadPdfLink(self, link, path, source):
    """Stream *link* to the file at *path*.

    source: label used in the error message when the download fails.
    Returns 1 when the HTTP status is 200, otherwise None.
    """
    print(link)
    resp = SessionInitializer.getSesh().get(link, stream=True)
    if resp.status_code != 200:
        print('Error: ' + source + ' pdf was not downloaded correctly.')
        return None
    with open(path, 'wb') as out:
        # Decompress the raw stream before copying it to disk.
        resp.raw.decode_content = True
        shutil.copyfileobj(resp.raw, out)
    return 1
def downloadMdpi(path):
    """Download the MDPI full-text PDF linked on the current browser page.

    Returns 1 on success, None when no PDF link exists or the download fails.
    """
    try:
        tag = ch.find_element_by_xpath("//li/a[text()='Full-Text PDF']")
        link = tag.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        print('MDPI link has no PDF, returning None...')
        return None
    print(link)
    http = SessionInitializer.getSesh()
    resp = http.get(link, stream=True)
    if resp.status_code == 200:
        with open(path, 'wb') as out:
            resp.raw.decode_content = True
            shutil.copyfileobj(resp.raw, out)
        return 1
    print('ERROR: watlib pdf MDPI was not downloaded correctly')
    return None
def downloadScholarPortal(href, path):
    """Click through to Scholars Portal and stream its PDF to *path*.

    Returns 1 on success, None if the PDF button is absent or the
    download fails.
    """
    href.click()
    try:
        btn = ch.find_element_by_xpath("//div[@class='download-btn']/a[text()='PDF Download']")
        pdf_url = btn.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        print('Racer or invalid link only, no scholarsportal returning none...')
        return None
    print(pdf_url)
    stream = SessionInitializer.getSesh().get(pdf_url, stream=True)
    if stream.status_code == 200:
        with open(path, 'wb') as fh:
            stream.raw.decode_content = True
            shutil.copyfileobj(stream.raw, fh)
        return 1
    print('ERROR: watlib pdf scholarsportal was not downloaded correctly')
    return None
def downloadScholarPortal(href, path):
    """Fetch the Scholars Portal PDF reached from *href*, saving to *path*.

    Returns 1 if downloaded, None otherwise.
    """
    href.click()
    try:
        anchor = ch.find_element_by_xpath(
            "//div[@class='download-btn']/a[text()='PDF Download']")
        target = anchor.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        print(
            'Racer or invalid link only, no scholarsportal returning none...')
        return None
    print(target)
    conn = SessionInitializer.getSesh()
    reply = conn.get(target, stream=True)
    if reply.status_code != 200:
        print('ERROR: watlib pdf scholarsportal was not downloaded correctly')
        return None
    with open(path, 'wb') as dest:
        reply.raw.decode_content = True
        shutil.copyfileobj(reply.raw, dest)
    return 1
def __init__(self):
    """Prepare the shared HTTP session and request headers for scraping.

    Both values come from the project's SessionInitializer module;
    presumably getSesh() returns a requests-style session object and
    getHeaders() the default headers to send with it — confirm there.
    """
    #Internet Session Setup
    self.session = SessionInitializer.getSesh()
    self.headers = SessionInitializer.getHeaders()
''' Created on Jan 05, 2016 @author: Ankai ''' from bs4 import BeautifulSoup import time from ReferenceParser import IeeeReferenceParser, SpringerReferenceParser, PaperReferenceExtractor, PdfObj import SessionInitializer import WatLibSeleniumParser SESSION = SessionInitializer.getSesh() class Paper: def __init__ (self, link, loadPaperPDFs=True): self.url = link self.pdfObj = None self.pap_info = {} #self.__pap_info['Publisher'] = '' self.citedByUrl = None self.citedByNum = 0 #Internet Session Setup self.loadFromScopus(loadPaperPDFs=loadPaperPDFs) def loadFromScopus(self, loadPaperPDFs=True): response = SESSION.get(self.url) soup = BeautifulSoup(response.content, 'lxml') # PDF Object if loadPaperPDFs:
''' Created on Jan 05, 2016 @author: Ankai ''' from bs4 import BeautifulSoup import time from ReferenceParser import IeeeReferenceParser, SpringerReferenceParser, PaperReferenceExtractor, PdfObj import SessionInitializer import WatLibSeleniumParser SESSION = SessionInitializer.getSesh() WATPARSER = WatLibSeleniumParser.WatLibParser() class Paper: def __init__(self, link, loadPaperPDFs=True): self.url = link self.pdfObj = None self.pap_info = {} #self.__pap_info['Publisher'] = '' self.citedByUrl = None self.citedByNum = 0 #Internet Session Setup self.loadFromScopus(loadPaperPDFs=loadPaperPDFs) def loadFromScopus(self, loadPaperPDFs=True): response = SESSION.get(self.url) soup = BeautifulSoup(response.content, 'lxml')