Esempio n. 1
0
def get_pdf(html):
    """Download the PDFs linked from an open-access listing page.

    Every ``href="....pdf"`` anchor whose link text starts with ``pdf`` is
    treated as a path relative to ``http://www.cv-foundation.org/openaccess/``
    and saved into the ``CVPR2015`` directory, which is created in the
    current working directory when missing.  Files that already exist
    locally are logged as ``Exist`` and not fetched again.

    Args:
        html: Text of the listing page to scan for PDF links.

    Returns:
        None.  Progress is reported through ``prgbar.ProgressBar``.
    """
    # Resolve the downloader locally so the call works on Python 3
    # (urllib.request) as well as on Python 2 (urllib).
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_url = 'http://www.cv-foundation.org/openaccess/'
    dir_name = 'CVPR2015'
    pdflist = re.findall(r'href="(.+?\.pdf)">pdf', html)
    # Compiled once, outside the loop: the pattern is the same for every link.
    name_re = re.compile(r'papers/(.+?\.pdf)')
    pbar = prgbar.ProgressBar(total=len(pdflist))

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for count, pdfurl in enumerate(pdflist, 1):
        url = base_url + pdfurl
        # Prefer the part after "papers/"; fall back to the link itself so a
        # single unexpected href does not abort the batch with an IndexError.
        match = name_re.search(pdfurl)
        name = match.group(1) if match else pdfurl
        # Keep only the last path component so the file always lands
        # directly inside dir_name.
        filename = dir_name + '/' + name.rsplit('/', 1)[-1]
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            # Download under a temporary name and rename on success, so an
            # interrupted transfer is never mistaken for a finished file on
            # the next run.
            partial = filename + '.part'
            urlretrieve(url, partial)
            os.rename(partial, filename)
        pbar.update(index=count)

    pbar.finish()
Esempio n. 2
0
def get_pdf(html, keywords):
    """Download the PDFs linked from an open-access listing page.

    Every ``href="....pdf"`` anchor whose link text starts with ``pdf`` is
    treated as a path relative to ``http://openaccess.thecvf.com/`` and
    saved into the ``CVPR2018`` directory, which is created in the current
    working directory when missing.  The local file name is produced by
    ``contrust_paper_name`` from the underscore-separated words of the
    linked file name, leaving out the first word and the last three.
    Files that already exist locally are logged as ``Exist`` and not
    fetched again.

    Args:
        html: Text of the listing page to scan for PDF links.
        keywords: Accepted for interface compatibility but currently
            unused; no keyword filtering is applied and every linked
            paper is downloaded.

    Returns:
        None.  Progress is reported through ``prgbar.ProgressBar`` and each
        target file name is printed.
    """
    # Resolve the downloader locally so the call works on Python 3
    # (urllib.request) as well as on Python 2 (urllib).
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_url = 'http://openaccess.thecvf.com/'
    dir_name = 'CVPR2018'
    pdflist = re.findall(r'href="(.+?\.pdf)">pdf', html)
    # Compiled once, outside the loop: the pattern is the same for every link.
    name_re = re.compile(r'papers/(.+?\.pdf)')
    pbar = prgbar.ProgressBar(total=len(pdflist))

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for count, pdfurl in enumerate(pdflist, 1):
        url = base_url + pdfurl
        # Prefer the part after "papers/"; fall back to the last path
        # component so a single unexpected href does not abort the batch
        # with an IndexError.
        match = name_re.search(pdfurl)
        name_ori = match.group(1) if match else pdfurl.rsplit('/', 1)[-1]
        # Drop the first word and the trailing three words of the name.
        name_list = name_ori.split('_')[1:-3]
        filename = dir_name + '/' + contrust_paper_name(name_list)
        print(filename)
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            # Download under a temporary name and rename on success, so an
            # interrupted transfer is never mistaken for a finished file on
            # the next run.
            partial = filename + '.part'
            urlretrieve(url, partial)
            os.rename(partial, filename)
        pbar.update(index=count)

    pbar.finish()
Esempio n. 3
0
def get_pdf(html):
    """Download the PDFs linked from a proceedings listing page.

    Every ``href="....pdf"`` anchor whose link text starts with ``pdf`` is
    treated as a path relative to
    ``http://jmlr.org/proceedings/papers/v49/`` and saved into the
    ``COLT2016`` directory, which is created in the current working
    directory when missing.  Files that already exist locally are logged
    as ``Exist`` and not fetched again.

    Args:
        html: Text of the listing page to scan for PDF links.

    Returns:
        None.  Progress is reported through ``prgbar.ProgressBar``.
    """
    # Resolve the downloader locally so the call works on Python 3
    # (urllib.request) as well as on Python 2 (urllib).
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_url = 'http://jmlr.org/proceedings/papers/v49/'
    dir_name = 'COLT2016'
    pdflist = re.findall(r'href="(.+?\.pdf)">pdf', html)

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    pbar = prgbar.ProgressBar(total=len(pdflist))
    for count, pdfurl in enumerate(pdflist, 1):
        url = base_url + pdfurl
        # Use only the last path component locally: a link containing a
        # directory part (or "../") must not point outside dir_name or at a
        # sub-directory that does not exist.
        filename = dir_name + '/' + pdfurl.rsplit('/', 1)[-1]
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            # Download under a temporary name and rename on success, so an
            # interrupted transfer is never mistaken for a finished file on
            # the next run.
            partial = filename + '.part'
            urlretrieve(url, partial)
            os.rename(partial, filename)
        pbar.update(index=count)
    pbar.finish()
Esempio n. 4
0
def get_pdf(html):
    """Download the papers linked from a listing page as PDF files.

    Every ``href="/paper/<name>"`` link found in *html* is fetched from
    ``http://papers.nips.cc/paper/<name>.pdf`` and saved into the
    ``NIPS2014`` directory, which is created in the current working
    directory when missing.  Files that already exist locally are logged
    as ``Exist`` and not fetched again.

    Args:
        html: Text of the listing page to scan for paper links.

    Returns:
        None.  Progress is reported through ``prgbar.ProgressBar``.
    """
    # Resolve the downloader locally so the call works on Python 3
    # (urllib.request) as well as on Python 2 (urllib).
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    base_url = 'http://papers.nips.cc/paper/'
    dir_name = 'NIPS2014'
    pdflist = re.findall(r'href="/paper/(.+?)"', html)
    pbar = prgbar.ProgressBar(total=len(pdflist))

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for count, pdfurl in enumerate(pdflist, 1):
        url = base_url + pdfurl + '.pdf'
        # Use only the last path component locally: a link containing a
        # directory part (or "../") must not point outside dir_name or at a
        # sub-directory that does not exist.
        filename = dir_name + '/' + pdfurl.rsplit('/', 1)[-1] + '.pdf'
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            # Download under a temporary name and rename on success, so an
            # interrupted transfer is never mistaken for a finished file on
            # the next run.
            partial = filename + '.part'
            urlretrieve(url, partial)
            os.rename(partial, filename)
        pbar.update(index=count)

    pbar.finish()