def get_pdf(html):
    """Download every CVPR 2015 paper PDF linked from *html* into ``CVPR2015/``.

    Links are found with the pattern ``href="<path>.pdf">pdf``; each captured
    path is appended to ``http://www.cv-foundation.org/openaccess/`` to form
    the download URL.  The local file name is the part of the path after
    ``papers/``; when the path has no ``papers/`` component the last path
    segment is used instead.

    Files that already exist locally are skipped (and logged as ``Exist``).
    Each download is written to ``<name>.part`` first and renamed only after
    it completes, so an interrupted run never leaves a truncated PDF that a
    later run would mistake for a finished one.

    Args:
        html: Text of the index page to scan for PDF links.

    Raises:
        Whatever ``urlretrieve`` / ``os`` raise on network or filesystem
        failure; the partial file is removed before the error propagates.
    """
    try:  # Python 3 moved urlretrieve into urllib.request
        from urllib.request import urlretrieve
    except ImportError:  # Python 2
        from urllib import urlretrieve

    base_url = 'http://www.cv-foundation.org/openaccess/'
    dir_name = 'CVPR2015'
    # [^"] keeps a match inside a single href attribute; '.' could run
    # across a closing quote into a later link on the same line.
    link_re = re.compile(r'href="([^"]+?\.pdf)">pdf')
    # Compiled once here instead of on every loop iteration.
    name_re = re.compile(r'papers/(.+?\.pdf)')

    pdflist = link_re.findall(html)
    pbar = prgbar.ProgressBar(total=len(pdflist))
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for idx, pdfurl in enumerate(pdflist):
        url = base_url + pdfurl
        found = name_re.search(pdfurl)
        # Fall back to the last path segment rather than crashing the whole
        # batch on a link that does not contain 'papers/'.
        name = found.group(1) if found else pdfurl.rsplit('/', 1)[-1]
        filename = dir_name + '/' + name
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            partial = filename + '.part'
            try:
                urlretrieve(url, partial)
                os.rename(partial, filename)
            except BaseException:
                # Also covers Ctrl-C: drop the incomplete file, then re-raise
                # so callers see the same exception as before.
                if os.path.exists(partial):
                    os.remove(partial)
                raise
        pbar.update(index=(idx + 1))
    pbar.finish()
def get_pdf(html):
    """Download every COLT 2016 paper PDF linked from *html* into ``COLT2016/``.

    Links are found with the pattern ``href="<name>.pdf">pdf``; each captured
    value is appended to ``http://jmlr.org/proceedings/papers/v49/`` to form
    the download URL and is also used, unchanged, as the local file name
    inside ``COLT2016/``.

    Files that already exist locally are skipped (and logged as ``Exist``).
    Each download is written to ``<name>.part`` first and renamed only after
    it completes, so an interrupted run never leaves a truncated PDF that a
    later run would mistake for a finished one.

    Args:
        html: Text of the index page to scan for PDF links.

    Raises:
        Whatever ``urlretrieve`` / ``os`` raise on network or filesystem
        failure; the partial file is removed before the error propagates.
    """
    try:  # Python 3 moved urlretrieve into urllib.request
        from urllib.request import urlretrieve
    except ImportError:  # Python 2
        from urllib import urlretrieve

    base_url = 'http://jmlr.org/proceedings/papers/v49/'
    dir_name = 'COLT2016'
    # [^"] keeps a match inside a single href attribute; '.' could run
    # across a closing quote into a later link on the same line.
    link_re = re.compile(r'href="([^"]+?\.pdf)">pdf')

    pdflist = link_re.findall(html)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    pbar = prgbar.ProgressBar(total=len(pdflist))

    for idx, pdfurl in enumerate(pdflist):
        url = base_url + pdfurl
        filename = dir_name + '/' + pdfurl
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            partial = filename + '.part'
            try:
                urlretrieve(url, partial)
                os.rename(partial, filename)
            except BaseException:
                # Also covers Ctrl-C: drop the incomplete file, then re-raise
                # so callers see the same exception as before.
                if os.path.exists(partial):
                    os.remove(partial)
                raise
        pbar.update(index=(idx + 1))
    pbar.finish()
def get_pdf(html):
    """Download every NIPS 2012 paper PDF linked from *html* into ``NIPS2012/``.

    Links are found with the pattern ``href="/paper/<id>"``; for each captured
    ``<id>`` the file ``http://papers.nips.cc/paper/<id>.pdf`` is saved as
    ``NIPS2012/<id>.pdf``.

    Files that already exist locally are skipped (and logged as ``Exist``).
    Each download is written to ``<id>.pdf.part`` first and renamed only after
    it completes, so an interrupted run never leaves a truncated PDF that a
    later run would mistake for a finished one.

    Args:
        html: Text of the index page to scan for paper links.

    Raises:
        Whatever ``urlretrieve`` / ``os`` raise on network or filesystem
        failure; the partial file is removed before the error propagates.
    """
    try:  # Python 3 moved urlretrieve into urllib.request
        from urllib.request import urlretrieve
    except ImportError:  # Python 2
        from urllib import urlretrieve

    base_url = 'http://papers.nips.cc/paper/'
    dir_name = 'NIPS2012'
    link_re = re.compile(r'href="/paper/(.+?)"')

    pdflist = link_re.findall(html)
    pbar = prgbar.ProgressBar(total=len(pdflist))
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for idx, pdfurl in enumerate(pdflist):
        # The page links to the paper's landing path; the PDF lives at the
        # same path with a '.pdf' suffix.
        url = base_url + pdfurl + '.pdf'
        filename = dir_name + '/' + pdfurl + '.pdf'
        pbar.log(url)
        if os.path.exists(filename):
            pbar.log('Exist')
        else:
            partial = filename + '.part'
            try:
                urlretrieve(url, partial)
                os.rename(partial, filename)
            except BaseException:
                # Also covers Ctrl-C: drop the incomplete file, then re-raise
                # so callers see the same exception as before.
                if os.path.exists(partial):
                    os.remove(partial)
                raise
        pbar.update(index=(idx + 1))
    pbar.finish()