def iclr_bib_2015_2016(year):
    """Scrape the ICLR accepted-papers archive page for 2015 or 2016 and
    return all entries concatenated as one bibtex string.

    From the first 2015 workshop paper onward, entries are tagged as
    'ICLRWorkshop' instead of 'ICLR'.
    """
    assert year == 2015 or year == 2016
    # Title of the first workshop paper in the 2015 listing; everything
    # from this entry on is a workshop paper.
    first_workshop_15 = 'Learning Non-deterministic Representations with Energy-based Ensembles'
    link = ('https://iclr.cc/archive/www/doku.php%3Fid=iclr'
            + str(year) + ':accepted-main.html')
    conf = 'ICLR'
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for div in soup.select('li.level1 div'):
        title = div.a.get_text()
        pdflink = div.a['href']
        # drop the anchor so the remaining text is just the author list
        div.a.decompose()
        authors = authors_str2lst(div.get_text())
        if not authors:
            continue
        # once the workshop section starts, stay in workshop mode
        if year == 2015 and first_workshop_15 in title:
            conf = 'ICLRWorkshop'
        bib_id = gen_id(year, conf, authors, title)
        res += gen_single_bib(bib_id, title, ' and '.join(authors),
                              pdflink, year, conf)
    return res
def iclr_bib_2014():
    """Scrape the ICLR 2014 conference-proceedings page and return all
    entries concatenated as one bibtex string.

    The page lays out title paragraphs (containing an <a> link, actually
    to arxiv) alternating with author paragraphs (no link); they are
    separated and zipped back together pairwise.
    """
    link = 'https://iclr.cc/archive/2014/conference-proceedings/'
    year = 2014
    conf = 'ICLR'
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    # all blocks
    ps = soup.select('#sites-canvas-main-content p')
    # remove the empty blocks
    ps = [p for p in ps if p.get_text().strip()]
    # remove the conference title: "Listed below are the conference papers .."
    ps = ps[1:]
    # title paragraphs carry a link; author paragraphs do not
    ptitles = [p for p in ps if p.find('a')]
    pauthors = [p for p in ps if not p.find('a')]
    # NOTE: the original code built two throwaway lists of stripped text
    # here and discarded them; that dead work has been removed.
    res = ''
    for ptitle, pauthor in zip(ptitles, pauthors):
        title = ptitle.get_text().strip()
        authors = authors_str2lst(pauthor.get_text().strip())
        # actually an arxiv link
        pdflink = ptitle.find('a')['href']
        bib_id = gen_id(year, conf, authors, title)
        res += gen_single_bib(bib_id, title, ' and '.join(authors),
                              pdflink, year, conf)
    return res
def ieee_index_page(punumber):
    """Return a list [title] and [link] for each conference year.

    The year,link pair should be determined outside.  Results are
    reversed so they run oldest-first.
    """
    # e.g. https://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=1000639
    url = ('https://ieeexplore.ieee.org/xpl/conhome.jsp?punumber='
           + str(punumber))
    html_file = download_to_hash(url)
    # defensive re-download in case the cached file went missing
    if not os.path.exists(html_file):
        download_file(url, html_file)
    soup = BeautifulSoup(open(html_file), 'lxml')
    anchors = soup.select('.detail li a')
    texts = [a.get_text().strip() for a in anchors]
    links = []
    for a in anchors:
        href = a['href']
        if not href.startswith('http'):
            href = 'https://ieeexplore.ieee.org/xpl/' + href
        links.append(href)
    texts.reverse()
    links.reverse()
    return texts, links
def springer_get_volumes(link):
    """Return [volume_link], including the given link itself."""
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    volumes = [link]
    for anchor in soup.select('.other-volumes-container li a'):
        url = anchor['href']
        # relative links need the Springer host prefixed
        if not url.startswith('http'):
            url = 'https://link.springer.com/' + url
        volumes.append(url)
    return volumes
def springer_get_pagination(link):
    """Return a list of links, one per pagination page of this volume."""
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    # the max page number is embedded in a .test-maxpagenum element;
    # absent that, assume a single page
    nodes = soup.select('.test-maxpagenum')
    pagenum = int(nodes[0].get_text()) if nodes else 1
    return [link + '?page=' + str(page) for page in range(1, pagenum + 1)]
def springer_parse_index(url):
    """Return (titles, links) for every card anchor on a Springer index page."""
    html_file = download_to_hash(url)
    soup = BeautifulSoup(open(html_file), 'lxml')
    titles = []
    links = []
    for anchor in soup.select('.c-card a'):
        href = anchor['href']
        # relative links need the Springer host prefixed
        if not href.startswith('http'):
            href = 'https://link.springer.com/' + href
        titles.append(anchor.get_text().strip())
        links.append(href)
    return titles, links
def ieee_conference_get_pagination(year, conf, link):
    """Return [link] for all pagination of this conference.

    BUG FIX: the original appended the query suffix to `link` itself
    inside the loop, so page N's URL accumulated the suffixes of all
    previous pages (…&pageNumber=1&…&pageNumber=2…).  Each page URL is
    now built fresh from the base link.
    """
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = []
    # hidden query-string fragment needed by IEEE's pagination
    hidden = soup.select('input#oqs')[0]['value']
    text = soup.select('.results-display')[0].get_text()
    # e.g. "Displaying Results 801 - 824 of 824"
    total = int(re.findall(r'Displaying Results .* of (\d+)', text)[0])
    # rowsPerPage=100
    pagenum = math.ceil(total / 100)
    for page in range(pagenum):
        # https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8449910&filter=issueId%20EQ%20%228460178%22&pageNumber=2
        page_link = (link + '&' + hidden + '&rowsPerPage=100'
                     + '&pageNumber=' + str(page + 1))
        res.append(page_link)
    return res
def acl_conference_bib(year, conf, link):
    """Return bibtex entries for an ACL anthology event page.

    Example: link='https://aclanthology.info/events/acl-2018',
    year=2018, conf='ACL'.
    """
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for p in soup.select('#content p'):
        # the paper title lives in <strong><a>…</a></strong>
        title = p.strong.a.get_text()
        # author anchors all point under /people
        authors = [a.get_text() for a in p.select('a[href^=/people]')]
        if not authors:
            continue
        pdflink = p.a['href']
        bib_id = gen_id(year, conf, authors, title)
        res += gen_single_bib(bib_id, title, ' and '.join(authors),
                              pdflink, year, conf)
    return res
def ieee_journal_index_page(punumber):
    """Return a {year: [links]} mapping of journal issues.

    Example index: https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=8860
    """
    url = ('https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber='
           + str(punumber))
    html_file = download_to_hash(url)
    soup = BeautifulSoup(open(html_file), 'lxml')
    volumes = {}
    for ul in soup.select('.volumes ul'):
        # the ul id looks like "pi-2012"; the year is the second part
        year = int(ul['id'].split('-')[1])
        issue_links = volumes.setdefault(year, [])
        for anchor in ul.select('li a'):
            href = anchor['href']
            # relative links need the IEEE host prefixed
            if not href.startswith('http'):
                href = 'https://ieeexplore.ieee.org/' + href
            issue_links.append(href)
    return volumes
def nips_bib(year):
    """Return bibtex entries for the NIPS proceedings of a given year.

    The papers.nips.cc volume number is derived from the year
    (volume 1 corresponds to 1988).
    """
    volume = year - 1988 + 1
    conf = 'NIPS'
    link = ('https://papers.nips.cc/book/advances-in-neural-information-processing-systems-'
            + str(volume) + '-' + str(year))
    html_file = download_to_hash(link)
    pdf_prefix = 'https://papers.nips.cc/'
    res = '\n'
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'html.parser')
    for li in soup.select('div.main')[0].ul.find_all('li'):
        href = li.a['href']
        title = li.a.string
        authors = [a.string for a in li.select('.author')]
        pdflink = pdf_prefix + href + '.pdf'
        # skip malformed entries lacking a title or authors
        if title and authors:
            bib_id = gen_id(year, conf, authors, title)
            res += gen_single_bib(bib_id, title, ' and '.join(authors),
                                  pdflink, year, conf)
    return res
def ieee_conference_bib(year, conf, link):
    """Return bibtex entries for one IEEE conference result page."""
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for div in soup.select('.txt'):
        # skip entries missing either a title link or an author list
        if not (div.h3.a and div.select('.authors a')):
            continue
        # strip embedded <formula> markup before reading the title
        for formula in div.select('formula'):
            formula.decompose()
        title = div.h3.get_text()
        # the href looks like "/document/8461262/"; pull out the number
        arnumber = re.findall(r'/(\d+)/', div.h3.a['href'])[0]
        pdflink = ('https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber='
                   + arnumber)
        authors = [a.get_text().strip() for a in div.select('.authors a')]
        bib_id = gen_id(year, conf, authors, title)
        res += gen_single_bib(bib_id, title, ' and '.join(authors),
                              pdflink, year, conf)
    return res
def springer_bib(year, conf, link):
    """Return bib for this page only."""
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for paper in soup.select('.chapter-item'):
        meta = paper.select('.content-type-list__meta')[0]
        meta_divs = meta.select('div')
        title = meta_divs[0].get_text()
        authors = authors_str2lst(meta_divs[1].get_text())
        # some conference may not have a pdflink, e.g.
        # https://link.springer.com//book/10.1007/BFb0015518
        pdflink = ''
        download_anchors = paper.select('a.test-book-toc-download-link')
        if download_anchors:
            pdflink = download_anchors[0]['href']
            if not pdflink.startswith('http'):
                pdflink = 'https://link.springer.com/' + pdflink
        bib_id = gen_id(year, conf, authors, title)
        res += gen_single_bib(bib_id, title, ' and '.join(authors),
                              pdflink, year, conf)
    return res