def open_review_bib_iclr_2013_2017(conf, year, localfile):
    """Parse a manually downloaded OpenReview HTML page for ICLR 2013 or 2017.

    The page lists accepted papers first, then workshop papers, then rejects.
    Papers from the first known workshop title onward are tagged
    'ICLRWorkshop'; parsing stops at the first known rejected title.

    Args:
        conf: conference tag for accepted papers (e.g. 'ICLR').
        year: 2013 or 2017 (asserted).
        localfile: path to the HTML file (must be downloaded manually).

    Returns:
        All bib entries concatenated into a single string.
    """
    # Sentinel titles marking section boundaries on each year's page.
    first_reject_13 = 'Heteroscedastic Conditional Ordinal Random'
    first_reject_17 = 'Energy-Based Spherical Sparse Coding'
    first_workshop_13 = 'Why Size Matters: Feature Coding as Nystrom Sampling'
    first_workshop_17 = 'Learning Continuous Semantic Representations'
    assert (year in [2013, 2017])
    if year == 2013:
        first_reject = first_reject_13
        first_workshop = first_workshop_13
    else:
        first_reject = first_reject_17
        first_workshop = first_workshop_17
    # Context manager so the file handle is closed deterministically
    # (matches the style already used in nips_bib).
    with open(localfile) as f:
        soup = BeautifulSoup(f, 'lxml')
    res = ''
    for paper in soup.select('#notes .note.panel'):
        title = paper.select('h2')[0].get_text().strip()
        pdflink = paper.select('a.note_content_pdf')[0]['href']
        # Rejected papers follow the accepted/workshop ones: stop here.
        if first_reject in title:
            break
        # From the first workshop paper onward, everything is a workshop paper.
        if first_workshop in title:
            conf = 'ICLRWorkshop'
        if not pdflink.startswith('http'):
            pdflink = 'https://openreview.net/' + pdflink
        authors_str = paper.select('.meta_row')[0].get_text()
        authors = authors_str2lst(authors_str)
        bib_id = gen_id(year, conf, authors, title)  # avoid shadowing builtin id
        bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                             year, conf)
        res += bib
    return res
def iclr_bib_2015_2016(year):
    """Scrape the ICLR 2015/2016 accepted-papers archive page into bib entries.

    For 2015, papers from the first workshop title onward are tagged
    'ICLRWorkshop'.

    Args:
        year: 2015 or 2016 (asserted).

    Returns:
        All bib entries concatenated into a single string.
    """
    assert (year == 2015 or year == 2016)
    # First workshop paper on the 2015 page; everything after it is workshop.
    first_workshop_15 = 'Learning Non-deterministic Representations with Energy-based Ensembles'
    link = ('https://iclr.cc/archive/www/doku.php%3Fid=iclr' + str(year) +
            ':accepted-main.html')
    conf = 'ICLR'
    html_file = download_to_hash(link)
    # Context manager so the cached HTML file is closed deterministically.
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'lxml')
    res = ''
    for div in soup.select('li.level1 div'):
        title = div.a.get_text()
        pdflink = div.a['href']
        # Remove the title anchor so the remaining div text is the authors.
        div.a.decompose()
        authors_str = div.get_text()
        authors = authors_str2lst(authors_str)
        if authors:
            # Switch tag to workshop from the first workshop paper onward.
            if year == 2015 and first_workshop_15 in title:
                conf = 'ICLRWorkshop'
            bib_id = gen_id(year, conf, authors, title)
            bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
def iclr_bib_2014():
    """Scrape the ICLR 2014 conference-proceedings page into bib entries.

    Title paragraphs contain an <a> (pointing at arXiv); author paragraphs do
    not, so the page alternates title/author paragraphs which are zipped
    together.

    Returns:
        All bib entries concatenated into a single string.
    """
    link = 'https://iclr.cc/archive/2014/conference-proceedings/'
    year = 2014
    conf = 'ICLR'
    html_file = download_to_hash(link)
    # Context manager so the cached HTML file is closed deterministically.
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'lxml')
    # all blocks
    ps = soup.select('#sites-canvas-main-content p')
    # remove the empty blocks
    ps = [p for p in ps if p.get_text().strip()]
    # remove the conference title: "Listed below are the conference papers .."
    ps = ps[1:]
    # title paragraphs have a link; author paragraphs do not
    ptitles = [p for p in ps if p.find('a')]
    pauthors = [p for p in ps if not p.find('a')]
    # (the original's two result-discarding comprehensions were dead code and
    # have been removed)
    res = ''
    for ptitle, pauthor in zip(ptitles, pauthors):
        title = ptitle.get_text().strip()
        authors_str = pauthor.get_text().strip()
        authors = authors_str2lst(authors_str)
        # the link actually points at arxiv
        pdflink = ptitle.find('a')['href']
        bib_id = gen_id(year, conf, authors, title)
        bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                             year, conf)
        res += bib
    return res
def open_review_bib_iclr_2018_2019(year, localfile):
    """Parse a saved OpenReview page for ICLR 2018/2019 into bib entries.

    Unlike earlier years, these pages group papers into explicit oral /
    poster / workshop sections, so the conference tag is taken per section.

    Args:
        year: the conference year (used only for the bib id/entry).
        localfile: path to the downloaded HTML file.

    Returns:
        All bib entries concatenated into a single string.
    """
    # Context manager so the file handle is closed deterministically.
    with open(localfile) as f:
        soup = BeautifulSoup(f, 'lxml')
    res = ''
    sections = [
        '#accepted-oral-papers li.note',
        '#accepted-poster-papers li.note',
        '#workshop-papers li.note'
    ]
    confs = ['ICLR', 'ICLR', 'ICLRWorkshop']
    for section, conf in zip(sections, confs):
        for paper in soup.select(section):
            title = paper.h4.a.get_text().strip()
            # there is one bogus 'No Title' entry in 2018
            if title == 'No Title':
                continue
            pdflink = paper.select('.pdf-link')[0]['href']
            if not pdflink.startswith('http'):
                pdflink = 'https://openreview.net/' + pdflink
            authors_str = paper.select('.note-authors')[0].get_text()
            authors = authors_str2lst(authors_str)
            bib_id = gen_id(year, conf, authors, title)  # avoid shadowing id
            bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
def acl_conference_bib(year, conf, link):
    """Scrape an aclanthology.info event page into bib entries.

    Example: acl_conference_bib(2018, 'ACL',
                                'https://aclanthology.info/events/acl-2018')

    Args:
        year: conference year (used in the bib id/entry).
        conf: conference tag, e.g. 'ACL'.
        link: the aclanthology event URL.

    Returns:
        All bib entries concatenated into a single string.
    """
    html_file = download_to_hash(link)
    # Context manager so the cached HTML file is closed deterministically.
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'lxml')
    res = ''
    for p in soup.select('#content p'):
        # The title is the anchor inside the paragraph's <strong>.
        title = p.strong.a.get_text()
        authors = [a.get_text() for a in p.select('a[href^=/people]')]
        # Paragraphs without author links are not paper entries.
        if authors:
            pdflink = p.a['href']
            bib_id = gen_id(year, conf, authors, title)
            bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
def nips_bib(year):
    """Collect bib entries for the NIPS proceedings of a given year.

    NIPS volume numbers start at 1 for 1988, hence the offset below.  Entries
    missing a title or authors are skipped.

    Args:
        year: the proceedings year (>= 1988).

    Returns:
        A string starting with a newline followed by all bib entries.
    """
    volume = year - 1988 + 1
    conf = 'NIPS'
    link = ('https://papers.nips.cc/book/'
            'advances-in-neural-information-processing-systems-'
            + str(volume) + '-' + str(year))
    html_file = download_to_hash(link)
    pdf_prefix = 'https://papers.nips.cc/'
    with open(html_file) as fh:
        soup = BeautifulSoup(fh, 'html.parser')
    entries = []
    for item in soup.select('div.main')[0].ul.find_all('li'):
        href = item.a['href']
        title = item.a.string
        authors = [a.string for a in item.select('.author')]
        pdflink = pdf_prefix + href + '.pdf'
        if title and authors:
            entry_id = gen_id(year, conf, authors, title)
            entries.append(gen_single_bib(entry_id, title,
                                          ' and '.join(authors), pdflink,
                                          year, conf))
    return '\n' + ''.join(entries)
def ieee_conference_bib(year, conf, link):
    """Scrape an IEEE Xplore result/listing page into bib entries.

    Entries without a title link or author links are skipped.  The stamp.jsp
    PDF link is reconstructed from the article number in the document href.

    Args:
        year: conference year (used in the bib id/entry).
        conf: conference tag.
        link: the IEEE Xplore page URL.

    Returns:
        All bib entries concatenated into a single string.
    """
    html_file = download_to_hash(link)
    # Context manager so the cached HTML file is closed deterministically.
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'lxml')
    res = ''
    for div in soup.select('.txt'):
        if div.h3.a and div.select('.authors a'):
            # Drop embedded <formula> markup so it doesn't pollute the title.
            for formula in div.select('formula'):
                formula.decompose()
            title = div.h3.get_text()
            # href looks like "/document/8461262/": extract the article number
            arnumber = re.findall(r'/(\d+)/', div.h3.a['href'])[0]
            pdflink = ('https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber='
                       + arnumber)
            authors = [
                author.get_text().strip()
                for author in div.select('.authors a')
            ]
            bib_id = gen_id(year, conf, authors, title)  # avoid shadowing id
            bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
def springer_bib(year, conf, link):
    """Return bib entries for a single Springer book/proceedings page.

    Args:
        year: publication year (used in the bib id/entry).
        conf: conference tag.
        link: the link.springer.com book page URL.

    Returns:
        All bib entries for this page concatenated into a single string.
    """
    html_file = download_to_hash(link)
    # Context manager so the cached HTML file is closed deterministically.
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'lxml')
    res = ''
    for paper in soup.select('.chapter-item'):
        meta = paper.select('.content-type-list__meta')[0]
        title = meta.select('div')[0].get_text()
        authors_str = meta.select('div')[1].get_text()
        authors = authors_str2lst(authors_str)
        pdflink_a = paper.select('a.test-book-toc-download-link')
        # some conferences may not have a pdflink, e.g.
        # https://link.springer.com//book/10.1007/BFb0015518
        pdflink = ''
        if pdflink_a:
            pdflink = pdflink_a[0]['href']
            if not pdflink.startswith('http'):
                pdflink = 'https://link.springer.com/' + pdflink
        bib_id = gen_id(year, conf, authors, title)  # avoid shadowing id
        bib = gen_single_bib(bib_id, title, ' and '.join(authors), pdflink,
                             year, conf)
        res += bib
    return res