def download_one(recid, version):
    """Download given version of the PDF from arxiv

    Only the first arXiv id attached to the record is processed; every
    further id merely triggers a warning message.

    The PDF is fetched into a temporary file under CFG_TMPSHAREDDIR and
    its md5 is compared with the latest "arXiv" document already
    attached to the record (if any). The download is attached as a new
    file when the record has no such document, or as a new version when
    the checksums differ.

    @param recid: id of the record to process
    @param version: version of the PDF, passed on to build_arxiv_url()
    @raise PdfNotAvailable: the download is smaller than 25000 bytes and
        contains the text 'PDF unavailable'
    @raise FoundExistingPdf: the download has the same md5 as the
        document already attached to the record
    """
    write_message('fetching %s' % recid)
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        # Everything after the first arXiv id is only reported.
        if position > 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        safe_arxiv_id = arxiv_id.replace('/', '_')
        # Keep a reference to the temporary file object for the whole
        # iteration: only its name is handed to the downloader.
        scratch_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                          dir=CFG_TMPSHAREDDIR,
                                          suffix="%s.pdf" % safe_arxiv_id)
        write_message('downloading pdf from %s' % pdf_url)
        path = download_external_url(pdf_url,
                                     scratch_file.name,
                                     content_type='pdf')

        # A small download may be an HTML "not found" page rather than
        # a PDF, so look for the tell-tale text.
        if os.path.getsize(path) < 25000:
            with open(path) as small_download:
                for line in small_download:
                    if 'PDF unavailable' in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # Nothing attached yet: the download is always needed.
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 == existing_md5:
                write_message('md5 matches existing pdf, skipping')
                needs_update = False
            else:
                write_message('md5 differs updating')
                needs_update = True

        if not needs_update:
            raise FoundExistingPdf()

        if bibdocfiles:
            write_message('adding as new version')
            docs.add_new_version(path, docname=bibdocfile.name)
        else:
            write_message('adding as new file')
            docs.add_new_file(path,
                              doctype="arXiv",
                              docname="arXiv:%s" % safe_arxiv_id)
def test_md5_algorithms(self):
    """bibdocfile - compare md5 algorithms

    Writes a small file under self.path and checks that the internal
    md5 implementation (force_internal=True) returns the same value as
    calculate_md5_external() for it.
    """
    from invenio.bibdocfile import calculate_md5, \
        calculate_md5_external
    filepath = os.path.join(self.path, 'test.txt')
    # Close the file explicitly so its content is flushed to disk
    # before either checksum implementation reads it back.
    with open(filepath, "w") as test_file:
        test_file.write("test")
    self.assertEqual(calculate_md5(filepath, force_internal=True),
                     calculate_md5_external(filepath))
def test_posting_file(self):
    """webstyle - direct posting of a file

    POSTs the test GIF to /httptest/post2 on the site named by
    CFG_SITE_URL and checks that the response body is identical to the
    body that was sent.
    """
    path = os.path.join(CFG_PREFIX, 'lib', 'webtest', 'invenio', 'test.gif')
    # A GIF is binary data: read it in binary mode so no newline
    # translation can alter the bytes, and close the handle right away.
    with open(path, 'rb') as gif_file:
        body = gif_file.read()
    md5 = calculate_md5(path)
    mimetype = 'image/gif'
    connection = httplib.HTTPConnection(urlparse.urlsplit(CFG_SITE_URL)[1])
    try:
        connection.request('POST', '/httptest/post2', body,
                           {'Content-MD5': md5,
                            'Content-Type': mimetype,
                            'Content-Disposition': 'filename=test.gif'})
        response = connection.getresponse()
        body2 = response.read()
    finally:
        # Release the socket even when the request or the read fails.
        connection.close()
    self.assertEqual(body, body2, "Body sent differs from body received")
def bst_dump_records():
    """Dump every exported collection as gzipped XML under CFG_WEBDIR/dumps.

    For each collection in CFG_EXPORTED_COLLECTIONS this writes
    ``<collection>-records.xml.gz`` (each record formatted with the 'xme'
    output format, wrapped in a ``<collection>`` element), a matching
    ``.md5`` file holding its checksum, and an entry in the
    ``inspire-dump.html`` index.

    Every file is first written under a dot-prefixed name and renamed to
    its final name only after it has been completely written and closed.
    """
    dumps_dir = os.path.join(CFG_WEBDIR, 'dumps')
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # Best effort: the directory normally exists already. Any other
        # problem shows up when the index file is opened just below.
        pass
    index_tmp_path = os.path.join(dumps_dir, '.inspire-dump.html')
    html_index = open(index_tmp_path, "w")
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """ <li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a> (<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                'prefix': CFG_SITE_URL,
                'collection': collection,
                'date': time.ctime()
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir, '.%s-records.xml.gz' % collection)
            final_path = os.path.join(dumps_dir, '%s-records.xml.gz' % collection)
            output = gzip.open(output_path, "w")
            # try/finally rather than "with": the gzip stream must be
            # closed even when formatting a record fails.
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    with run_ro_on_slave_db():
                        print >> output, format_record(recid, 'xme', user_info={})[0]
                    # Called once per record; the second item is fed to
                    # time.localtime() below, i.e. used as a timestamp.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s" %
                            (collection, recid, (i + 1) * 100 / tot,
                             time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime(time_estimation))))
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            checksum = calculate_md5(output_path)
            # Close the checksum file before renaming it, so that its
            # content is guaranteed to be on disk under the final name.
            with open(output_path + '.md5', "w") as md5_file:
                print >> md5_file, checksum
            os.rename(output_path, final_path)
            os.rename(output_path + '.md5', final_path + '.md5')
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()
    os.rename(index_tmp_path, os.path.join(dumps_dir, 'inspire-dump.html'))
def bst_dump_records():
    """Dump every exported collection as gzipped XML under CFG_WEBDIR/dumps.

    For each collection in CFG_EXPORTED_COLLECTIONS this writes
    ``<collection>-records.xml.gz`` (each record formatted with the "xme"
    output format, wrapped in a ``<collection>`` element), a matching
    ``.md5`` file holding its checksum, and an entry in the
    ``inspire-dump.html`` index.

    Every file is first written under a dot-prefixed name and renamed to
    its final name only after it has been completely written and closed.
    """
    dumps_dir = os.path.join(CFG_WEBDIR, "dumps")
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # Best effort: the directory normally exists already. Any other
        # problem shows up when the index file is opened just below.
        pass
    index_tmp_path = os.path.join(dumps_dir, ".inspire-dump.html")
    html_index = open(index_tmp_path, "w")
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """ <li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a> (<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                "prefix": CFG_SITE_URL,
                "collection": collection,
                "date": time.ctime(),
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir, ".%s-records.xml.gz" % collection)
            final_path = os.path.join(dumps_dir, "%s-records.xml.gz" % collection)
            output = gzip.open(output_path, "w")
            # The gzip stream must be closed even when formatting a
            # record fails, hence the try/finally.
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    print >> output, format_record(recid, "xme", user_info={})[0]
                    # Called once per record; the second item is fed to
                    # time.localtime() below, i.e. used as a timestamp.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s"
                            % (
                                collection,
                                recid,
                                (i + 1) * 100 / tot,
                                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                            )
                        )
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            checksum = calculate_md5(output_path)
            # Close the checksum file before renaming it, so that its
            # content is guaranteed to be on disk under the final name.
            md5_file = open(output_path + ".md5", "w")
            try:
                print >> md5_file, checksum
            finally:
                md5_file.close()
            os.rename(output_path, final_path)
            os.rename(output_path + ".md5", final_path + ".md5")
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()
    os.rename(index_tmp_path, os.path.join(dumps_dir, "inspire-dump.html"))
def test_posting_file(self):
    """webstyle - direct posting of a file

    POSTs the test GIF to /httptest/post2 on the site named by
    CFG_SITE_URL and checks that the response body is identical to the
    body that was sent.
    """
    path = os.path.join(CFG_PREFIX, "lib", "webtest", "invenio", "test.gif")
    # A GIF is binary data: read it in binary mode so no newline
    # translation can alter the bytes, and close the handle right away.
    with open(path, "rb") as gif_file:
        body = gif_file.read()
    md5 = calculate_md5(path)
    mimetype = "image/gif"
    connection = httplib.HTTPConnection(urlparse.urlsplit(CFG_SITE_URL)[1])
    try:
        connection.request(
            "POST",
            "/httptest/post2",
            body,
            {"Content-MD5": md5, "Content-Type": mimetype, "Content-Disposition": "filename=test.gif"},
        )
        response = connection.getresponse()
        body2 = response.read()
    finally:
        # Release the socket even when the request or the read fails.
        connection.close()
    self.assertEqual(body, body2, "Body sent differs from body received")
def test_md5_algorithms(self):
    """bibdocfile - compare md5 algorithms

    Writes a small file under self.path and checks that the internal
    md5 implementation (force_internal=True) returns the same value as
    calculate_md5_external() for it.
    """
    filepath = os.path.join(self.path, "test.txt")
    # Close the file explicitly so its content is flushed to disk
    # before either checksum implementation reads it back.
    with open(filepath, "w") as test_file:
        test_file.write("test")
    self.assertEqual(calculate_md5(filepath, force_internal=True), calculate_md5_external(filepath))