Esempio n. 1
0
def iclr_bib_2015_2016(year):
    assert (year == 2015 or year == 2016)
    # 15 first workshop:
    first_workshop_15 = 'Learning Non-deterministic Representations with Energy-based Ensembles'
    link = ('https://iclr.cc/archive/www/doku.php%3Fid=iclr' + str(year) +
            ':accepted-main.html')
    conf = 'ICLR'
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for div in soup.select('li.level1 div'):
        title = div.a.get_text()
        pdflink = div.a['href']
        div.a.decompose()
        authors_str = div.get_text()
        authors = authors_str2lst(authors_str)
        if authors:
            # change title to workshop
            if year == 2015 and first_workshop_15 in title:
                conf = 'ICLRWorkshop'
            id = gen_id(year, conf, authors, title)
            bib = gen_single_bib(id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
Esempio n. 2
0
def iclr_bib_2014():
    link = 'https://iclr.cc/archive/2014/conference-proceedings/'
    year = 2014
    conf = 'ICLR'
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    # all blocks
    ps = soup.select('#sites-canvas-main-content p')
    # remove the empty blocks
    ps = [p for p in ps if p.get_text().strip()]
    # remove the conference title: Listed below are the conference papers ..
    ps = ps[1:]
    # titles
    ptitles = [p for p in ps if p.find('a')]
    # authors
    pauthors = [p for p in ps if not p.find('a')]
    [p.get_text().strip() for p in pauthors]
    [p.get_text().strip() for p in ptitles]
    res = ''
    for ptitle, pauthor in zip(ptitles, pauthors):
        title = ptitle.get_text().strip()
        authors_str = pauthor.get_text().strip()
        authors = authors_str2lst(authors_str)
        # actually arxiv
        pdflink = ptitle.find('a')['href']
        id = gen_id(year, conf, authors, title)
        bib = gen_single_bib(id, title, ' and '.join(authors), pdflink, year,
                             conf)
        res += bib
    return res
Esempio n. 3
0
def ieee_index_page(punumber):
    """Return a list [title] and [link] for each conference year. The
    year,link pair should be determined outside.

    """
    # if not exist, download to a tmp file
    # https://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=1000639
    # parse the file
    # get the list of punumber for each year, and return
    url = 'https://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=' + str(
        punumber)
    html_file = download_to_hash(url)
    if not os.path.exists(html_file):
        download_file(url, html_file)
    soup = BeautifulSoup(open(html_file), 'lxml')
    texts = []
    links = []
    for a in soup.select('.detail li a'):
        text = a.get_text().strip()
        link = a['href']
        if not link.startswith('http'):
            link = 'https://ieeexplore.ieee.org/xpl/' + link
        texts.append(text)
        links.append(link)
    return list(reversed(texts)), list(reversed(links))
Esempio n. 4
0
def springer_get_volumes(link):
    """Return [volume_link], including this one.
    """
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = [link]
    for a in soup.select('.other-volumes-container li a'):
        href = a['href']
        if not href.startswith('http'):
            href = 'https://link.springer.com/' + href
        res.append(href)
    return res
Esempio n. 5
0
def springer_get_pagination(link):
    """Return a list of links for each pagination
    """
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    if soup.select('.test-maxpagenum'):
        pagenum = int(soup.select('.test-maxpagenum')[0].get_text())
    else:
        pagenum = 1
    res = []
    for p in range(pagenum):
        res.append(link + '?page=' + str(p+1))
    return res
Esempio n. 6
0
def springer_parse_index(url):
    """Return titles, links
    """
    html_file = download_to_hash(url)
    soup = BeautifulSoup(open(html_file), 'lxml')
    titles = []
    links = []
    for a in soup.select('.c-card a'):
        link = a['href']
        if not link.startswith('http'):
            link = 'https://link.springer.com/' + link
        title = a.get_text().strip()
        titles.append(title)
        links.append(link)
    return titles, links
Esempio n. 7
0
def ieee_conference_get_pagination(year, conf, link):
    """Return [link] for all pagination of this conference
    """
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = []
    # if have pagination
    hidden = soup.select('input#oqs')[0]['value']
    text = soup.select('.results-display')[0].get_text()
    # Displaying Results 801 - 824 of 824
    total = int(re.findall(r'Displaying Results .* of (\d+)', text)[0])
    # rowsPerPage=100
    pagenum = math.ceil(total / 100)
    for page in range(0, pagenum):
        # https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8449910&filter=issueId%20EQ%20%228460178%22&pageNumber=2
        link += ('&' + hidden + '&rowsPerPage=100' + '&pageNumber=' +
                 str(page + 1))
        res.append(link)
    return res
Esempio n. 8
0
def acl_conference_bib(year, conf, link):
    # link = 'https://aclanthology.info/events/acl-2018'
    # year = 2018
    # conf = 'ACL'
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    # soup.select('#content p')[3].select('a[href^=/people]')

    # len(soup.select('#content p'))
    for p in soup.select('#content p'):
        strong = p.strong
        title = strong.a.get_text()
        authors = [a.get_text() for a in p.select('a[href^=/people]')]
        if authors:
            pdflink = p.a['href']
            id = gen_id(year, conf, authors, title)
            bib = gen_single_bib(id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
Esempio n. 9
0
def ieee_journal_index_page(punumber):
    """Return a {year: [links]}
    """
    # https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=8860
    # punumber=8860
    url = 'https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=' + str(
        punumber)
    html_file = download_to_hash(url)
    soup = BeautifulSoup(open(html_file), 'lxml')
    volumes = {}
    for ul in soup.select('.volumes ul'):
        # pi-2012
        year = int(ul['id'].split('-')[1])
        if not year in volumes:
            volumes[year] = []
        for a in ul.select('li a'):
            link = a['href']
            if not link.startswith('http'):
                link = 'https://ieeexplore.ieee.org/' + link
            volumes[year].append(link)
    return volumes
Esempio n. 10
0
def nips_bib(year):
    ID = year - 1988 + 1
    conf = 'NIPS'
    link = 'https://papers.nips.cc/book/advances-in-neural-information-processing-systems-' + str(
        ID) + '-' + str(year)
    html_file = download_to_hash(link)
    NIPS_pdf_prefix = 'https://papers.nips.cc/'
    res = '\n'
    with open(html_file) as f:
        soup = BeautifulSoup(f, 'html.parser')
        for li in soup.select("div.main")[0].ul.find_all('li'):
            href = li.a['href']
            title = li.a.string
            authors = list(
                map(lambda author: author.string, li.select('.author')))
            pdflink = NIPS_pdf_prefix + href + '.pdf'
            if title and authors:
                id = gen_id(year, conf, authors, title)
                bib = gen_single_bib(id, title, ' and '.join(authors), pdflink,
                                     year, conf)
                res += bib
    return res
Esempio n. 11
0
def ieee_conference_bib(year, conf, link):
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for div in soup.select('.txt'):
        if div.h3.a and div.select('.authors a'):
            for f in div.select('formula'):
                f.decompose()
            title = div.h3.get_text()
            # "/document/8461262/"
            arnumber = re.findall(r'/(\d+)/', div.h3.a['href'])[0]
            pdflink = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=' + arnumber
            authors = [
                author.get_text().strip()
                for author in div.select('.authors a')
            ]
            # print(authors)
            id = gen_id(year, conf, authors, title)
            bib = gen_single_bib(id, title, ' and '.join(authors), pdflink,
                                 year, conf)
            res += bib
    return res
Esempio n. 12
0
def springer_bib(year, conf, link):
    """Return bib for this page only.
    """
    html_file = download_to_hash(link)
    soup = BeautifulSoup(open(html_file), 'lxml')
    res = ''
    for paper in soup.select('.chapter-item'):
        meta = paper.select('.content-type-list__meta')[0]
        title = meta.select('div')[0].get_text()
        authors_str = meta.select('div')[1].get_text()
        authors = authors_str2lst(authors_str)
        pdflink_a = paper.select('a.test-book-toc-download-link')
        pdflink = ''
        # some conference may not have a pdflink, e.g.
        # https://link.springer.com//book/10.1007/BFb0015518
        if pdflink_a:
            pdflink = pdflink_a[0]['href']
        if not pdflink.startswith('http'):
            pdflink = 'https://link.springer.com/' + pdflink
        id = gen_id(year, conf, authors, title)
        bib = gen_single_bib(id, title, ' and '.join(authors), pdflink, year, conf)
        res += bib
    return res