def test_download_external_url(self):
    """Check that EXAMPLE_PDF_URL_1 can be fetched with content_type='pdf'.

    A scratch path obtained from mkstemp() is passed as the download
    target.  If download_external_url raises InvenioFileDownloadError the
    test fails with the text of that error.  The scratch file is removed
    whether the download succeeded or not.
    """
    import sys
    from invenio.filedownloadutils import (download_external_url,
                                           InvenioFileDownloadError)

    temp_fd, temp_path = mkstemp()
    # Only the path is used below; release the descriptor immediately.
    os.close(temp_fd)
    try:
        try:
            download_external_url(EXAMPLE_PDF_URL_1,
                                  temp_path,
                                  content_type='pdf')
        except InvenioFileDownloadError:
            # The active exception is read through sys.exc_info() instead
            # of the "except X, e" form: the comma form is a syntax error
            # on Python 3, while this spelling is valid on Python 2 and 3.
            self.fail(str(sys.exc_info()[1]))
    finally:
        os.unlink(temp_path)
def test_download_external_url_invalid_content_type(self):
    """Check that requesting CFG_SITE_URL as a PDF is rejected.

    download_external_url is called on CFG_SITE_URL with
    content_type='pdf' and is expected to raise InvenioFileDownloadError
    (presumably because the site URL is not served as a PDF).  The test
    fails if the call returns normally.  The scratch file is always
    removed afterwards.
    """
    from invenio.filedownloadutils import (download_external_url,
                                           InvenioFileDownloadError)
    from invenio.config import CFG_SITE_URL

    scratch_fd, scratch_path = mkstemp()
    # Only the path is used below; release the descriptor immediately.
    os.close(scratch_fd)
    try:
        try:
            download_external_url(CFG_SITE_URL,
                                  scratch_path,
                                  content_type='pdf')
        except InvenioFileDownloadError:
            # This is the outcome the test is looking for.
            pass
        else:
            # The download went through without complaint.
            self.fail()
    finally:
        os.unlink(scratch_path)
def download_one(recid, version):
    """Fetch one version of a record's arXiv PDF and attach it if it is new.

    Only the first arXiv id extracted from the record is processed; each
    additional id only produces a warning message.

    The PDF is downloaded to a temporary file under CFG_TMPSHAREDDIR.  A
    download smaller than 25000 bytes is scanned for the text
    'PDF unavailable' and PdfNotAvailable is raised when it is found.

    The download is then compared by md5 with the first of the record's
    latest "arXiv" files:
      * no such file exists: the PDF is added as a new file;
      * md5 differs: the PDF is added as a new version of that file;
      * md5 matches: FoundExistingPdf is raised.
    """
    write_message('fetching %s' % recid)
    # NOTE(review): the whole download/compare/attach sequence is taken to
    # be part of the loop body, so a record without any arXiv id is a
    # no-op -- confirm against the original layout.
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if position != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        # Old-style arXiv ids contain '/', which cannot appear in a file name.
        safe_id = arxiv_id.replace('/', '_')
        # Keep the NamedTemporaryFile object referenced while its path is in
        # use: the file on disk is removed when the object is closed.
        scratch = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                     dir=CFG_TMPSHAREDDIR,
                                     suffix="%s.pdf" % safe_id)
        write_message('downloading pdf from %s' % pdf_url)
        pdf_path = download_external_url(pdf_url,
                                         scratch.name,
                                         content_type='pdf')

        # A small download may be a "not found" page rather than a PDF.
        if os.path.getsize(pdf_path) < 25000:
            handle = open(pdf_path)
            try:
                for text_line in handle:
                    if 'PDF unavailable' in text_line:
                        raise PdfNotAvailable()
            finally:
                handle.close()

        record_docs = BibRecDocs(recid)
        latest_files = record_docs.list_latest_files(doctype="arXiv")

        try:
            current_file = latest_files[0]
        except IndexError:
            # Nothing attached yet, so the download is new by definition.
            current_file = None
            outdated = True
        else:
            existing_digest = calculate_md5(current_file.fullpath)
            new_digest = calculate_md5(pdf_path.encode('utf-8'))
            outdated = new_digest != existing_digest
            if outdated:
                write_message('md5 differs updating')
            else:
                write_message('md5 matches existing pdf, skipping')

        if not outdated:
            raise FoundExistingPdf()

        if latest_files:
            write_message('adding as new version')
            record_docs.add_new_version(pdf_path, docname=current_file.name)
        else:
            write_message('adding as new file')
            record_docs.add_new_file(pdf_path,
                                     doctype="arXiv",
                                     docname="arXiv:%s" % safe_id)