Ejemplo n.º 1
0
def download_one(recid, version):
    """Fetch one version of a record's arXiv PDF and attach it to the record.

    Only the first arXiv id returned by extract_arxiv_ids_from_recid() is
    processed; every additional id only triggers a warning message.

    The PDF is downloaded into a temporary file under CFG_TMPSHAREDDIR.  It
    is then stored either as a new version of the record's latest "arXiv"
    document or, when the record has no such document yet, as a new file.

    Raises:
        PdfNotAvailable: the download is smaller than 25000 bytes and one of
            its lines contains the text 'PDF unavailable'.
        FoundExistingPdf: the md5 of the download equals the md5 of the
            latest "arXiv" file already attached to the record.
    """
    write_message('fetching %s' % recid)
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if position > 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        safe_id = arxiv_id.replace('/', '_')
        # Keep a reference to the temporary file object for the rest of the
        # iteration; only its name is handed to the downloader.
        pdf_tmp = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                     dir=CFG_TMPSHAREDDIR,
                                     suffix="%s.pdf" % safe_id)
        write_message('downloading pdf from %s' % pdf_url)
        path = download_external_url(pdf_url, pdf_tmp.name,
                                     content_type='pdf')

        # A small download may be an html "not found" page instead of a PDF.
        if os.path.getsize(path) < 25000:
            handle = open(path)
            try:
                for text_line in handle:
                    if 'PDF unavailable' in text_line:
                        raise PdfNotAvailable()
            finally:
                handle.close()

        record_docs = BibRecDocs(recid)
        latest_files = record_docs.list_latest_files(doctype="arXiv")

        try:
            current_file = latest_files[0]
        except IndexError:
            # Nothing attached yet: the download always has to be stored.
            current_file = None
            changed = True
        else:
            old_md5 = calculate_md5(current_file.fullpath)
            fresh_md5 = calculate_md5(path.encode('utf-8'))
            changed = fresh_md5 != old_md5
            if changed:
                write_message('md5 differs updating')
            else:
                write_message('md5 matches existing pdf, skipping')

        if not changed:
            raise FoundExistingPdf()

        if latest_files:
            write_message('adding as new version')
            record_docs.add_new_version(path, docname=current_file.name)
        else:
            write_message('adding as new file')
            record_docs.add_new_file(path,
                                     doctype="arXiv",
                                     docname="arXiv:%s" % safe_id)
Ejemplo n.º 2
0
def download_one(recid, version):
    """Download the requested version of a record's PDF from arXiv.

    The arXiv ids of the record come from extract_arxiv_ids_from_recid();
    the first one is downloaded, any further id is only reported through a
    warning message.  The downloaded file is attached to the record as a new
    version of its latest "arXiv" document, or as a new file when no such
    document exists.

    Raises:
        PdfNotAvailable: a download below 25000 bytes contains a line with
            the text 'PDF unavailable'.
        FoundExistingPdf: the download has the same md5 as the latest
            "arXiv" file of the record, so nothing is stored.
    """
    write_message('fetching %s' % recid)
    arxiv_ids = extract_arxiv_ids_from_recid(recid)
    for index, arxiv_id in enumerate(arxiv_ids):
        if index > 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        source_url = build_arxiv_url(arxiv_id, version)
        id_for_filename = arxiv_id.replace('/', '_')
        # The temporary file object stays referenced until the iteration is
        # over; the downloader receives its name as the target path.
        scratch = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                     dir=CFG_TMPSHAREDDIR,
                                     suffix="%s.pdf" % id_for_filename)
        write_message('downloading pdf from %s' % source_url)
        path = download_external_url(source_url, scratch.name,
                                     content_type='pdf')

        # Small files are scanned for the marker of an html error page.
        size_in_bytes = os.path.getsize(path)
        if size_in_bytes < 25000:
            downloaded = open(path)
            try:
                for content_line in downloaded:
                    if 'PDF unavailable' in content_line:
                        raise PdfNotAvailable()
            finally:
                downloaded.close()

        docs = BibRecDocs(recid)
        arxiv_files = docs.list_latest_files(doctype="arXiv")

        try:
            latest = arxiv_files[0]
        except IndexError:
            # No "arXiv" document yet, so the download must be added.
            latest = None
            must_store = True
        else:
            stored_md5 = calculate_md5(latest.fullpath)
            downloaded_md5 = calculate_md5(path.encode('utf-8'))
            must_store = downloaded_md5 != stored_md5
            if must_store:
                write_message('md5 differs updating')
            else:
                write_message('md5 matches existing pdf, skipping')

        if not must_store:
            raise FoundExistingPdf()

        if arxiv_files:
            write_message('adding as new version')
            docs.add_new_version(path, docname=latest.name)
        else:
            write_message('adding as new file')
            docs.add_new_file(path,
                              doctype="arXiv",
                              docname="arXiv:%s" % id_for_filename)
Ejemplo n.º 3
0
 def test_md5_algorithms(self):
     """bibdocfile - compare md5 algorithms

     Writes a small file and checks that calculate_md5() with
     force_internal=True returns the same value as
     calculate_md5_external() for it.
     """
     from invenio.bibdocfile import calculate_md5, \
         calculate_md5_external
     filepath = os.path.join(self.path, 'test.txt')
     # Close the file explicitly so the content is flushed to disk before
     # both checksums are computed, instead of relying on the handle being
     # garbage-collected in time.
     test_file = open(filepath, "w")
     try:
         test_file.write("test")
     finally:
         test_file.close()
     self.assertEqual(calculate_md5(filepath, force_internal=True),
                      calculate_md5_external(filepath))
Ejemplo n.º 4
0
 def test_posting_file(self):
     """webstyle - direct posting of a file

     POSTs test.gif to /httptest/post2 on CFG_SITE_URL together with its
     md5 in the Content-MD5 header and checks that the response body is
     identical to the body that was sent.
     """
     path = os.path.join(CFG_PREFIX, 'lib', 'webtest', 'invenio', 'test.gif')
     # A GIF is binary data: read it in binary mode so that no newline
     # translation can alter the bytes, and close the handle explicitly.
     gif_file = open(path, 'rb')
     try:
         body = gif_file.read()
     finally:
         gif_file.close()
     md5 = calculate_md5(path)
     mimetype = 'image/gif'
     connection = httplib.HTTPConnection(urlparse.urlsplit(CFG_SITE_URL)[1])
     try:
         connection.request('POST', '/httptest/post2', body, {'Content-MD5': md5, 'Content-Type': mimetype, 'Content-Disposition': 'filename=test.gif'})
         response = connection.getresponse()
         body2 = response.read()
     finally:
         # Release the socket even when the request or the read fails.
         connection.close()
     self.assertEqual(body, body2, "Body sent differs from body received")
Ejemplo n.º 5
0
def bst_dump_records():
    """Dump every exported collection to gzipped XML with an MD5 file.

    For each collection in CFG_EXPORTED_COLLECTIONS, the records returned by
    get_collection_reclist() are formatted with format_record(..., 'xme')
    and written to ``<collection>-records.xml.gz`` under
    ``CFG_WEBDIR/dumps``, next to a ``.md5`` file holding the result of
    calculate_md5() for the dump.  An ``inspire-dump.html`` index linking to
    all dumps is written as well.

    Every file is produced under a dot-prefixed name and renamed to its
    final name only once it is complete.  All file handles are closed
    explicitly, also when an error interrupts the dump, and the checksum
    file is closed before it is renamed.
    """
    dumps_dir = os.path.join(CFG_WEBDIR, 'dumps')
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # Deliberately best effort (presumably the directory already
        # exists); if it is really unusable the open() below fails.
        pass
    index_tmp_path = os.path.join(dumps_dir, '.inspire-dump.html')
    html_index = open(index_tmp_path, "w")
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                'prefix': CFG_SITE_URL,
                'collection': collection,
                'date': time.ctime()
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir,
                                       '.%s-records.xml.gz' % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    with run_ro_on_slave_db():
                        print >> output, format_record(recid, 'xme', user_info={})[0]
                    # Called for every record, not only when progress is
                    # reported, exactly as before.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s" %
                            (collection, recid, (i + 1) * 100 / tot,
                             time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime(time_estimation))))
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            # Use a named handle so the checksum file is closed (and thus
            # flushed) before it is renamed below.
            md5_file = open(output_path + '.md5', "w")
            try:
                print >> md5_file, calculate_md5(output_path)
            finally:
                md5_file.close()
            os.rename(
                output_path,
                os.path.join(dumps_dir, '%s-records.xml.gz' % collection))
            os.rename(
                output_path + '.md5',
                os.path.join(dumps_dir,
                             '%s-records.xml.gz.md5' % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()
    os.rename(index_tmp_path,
              os.path.join(dumps_dir, 'inspire-dump.html'))
Ejemplo n.º 6
0
def bst_dump_records():
    """Dump every exported collection to gzipped XML with an MD5 file.

    For each collection in CFG_EXPORTED_COLLECTIONS, the records returned by
    get_collection_reclist() are formatted with format_record(..., "xme")
    and written to ``<collection>-records.xml.gz`` under
    ``CFG_WEBDIR/dumps``, next to a ``.md5`` file holding the result of
    calculate_md5() for the dump.  An ``inspire-dump.html`` index linking to
    all dumps is written as well.

    Every file is produced under a dot-prefixed name and renamed to its
    final name only once it is complete.  All file handles are closed
    explicitly, also when an error interrupts the dump, and the checksum
    file is closed before it is renamed.
    """
    dumps_dir = os.path.join(CFG_WEBDIR, "dumps")
    try:
        os.makedirs(dumps_dir)
    except OSError:
        # Deliberately best effort (presumably the directory already
        # exists); if it is really unusable the open() below fails.
        pass
    index_tmp_path = os.path.join(dumps_dir, ".inspire-dump.html")
    html_index = open(index_tmp_path, "w")
    try:
        print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
        for collection in CFG_EXPORTED_COLLECTIONS:
            task_update_progress(collection)
            print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
                "prefix": CFG_SITE_URL,
                "collection": collection,
                "date": time.ctime(),
            }
            write_message("Preparing %s-records.xml.gz" % collection)
            output_path = os.path.join(dumps_dir, ".%s-records.xml.gz" % collection)
            output = gzip.open(output_path, "w")
            try:
                print >> output, "<collection>"
                reclist = get_collection_reclist(collection)
                tot = len(reclist)
                time_estimator = get_time_estimator(tot)
                for i, recid in enumerate(reclist):
                    print >> output, format_record(recid, "xme", user_info={})[0]
                    # Called for every record, not only when progress is
                    # reported, exactly as before.
                    time_estimation = time_estimator()[1]
                    if (i + 1) % 100 == 0:
                        task_update_progress(
                            "%s %s (%s%%) -> %s"
                            % (
                                collection,
                                recid,
                                (i + 1) * 100 / tot,
                                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_estimation)),
                            )
                        )
                        task_sleep_now_if_required()
                print >> output, "</collection>"
            finally:
                output.close()
            write_message("Computing checksum")
            # Use a named handle so the checksum file is closed (and thus
            # flushed) before it is renamed below.
            md5_file = open(output_path + ".md5", "w")
            try:
                print >> md5_file, calculate_md5(output_path)
            finally:
                md5_file.close()
            os.rename(output_path, os.path.join(dumps_dir, "%s-records.xml.gz" % collection))
            os.rename(output_path + ".md5", os.path.join(dumps_dir, "%s-records.xml.gz.md5" % collection))
            write_message("DONE")
        print >> html_index, "</ul></body></html>"
    finally:
        html_index.close()
    os.rename(index_tmp_path, os.path.join(dumps_dir, "inspire-dump.html"))
 def test_posting_file(self):
     """webstyle - direct posting of a file

     POSTs test.gif to /httptest/post2 on CFG_SITE_URL together with its
     md5 in the Content-MD5 header and checks that the response body is
     identical to the body that was sent.
     """
     path = os.path.join(CFG_PREFIX, "lib", "webtest", "invenio", "test.gif")
     # A GIF is binary data: read it in binary mode so that no newline
     # translation can alter the bytes, and close the handle explicitly.
     gif_file = open(path, "rb")
     try:
         body = gif_file.read()
     finally:
         gif_file.close()
     md5 = calculate_md5(path)
     mimetype = "image/gif"
     connection = httplib.HTTPConnection(urlparse.urlsplit(CFG_SITE_URL)[1])
     try:
         connection.request(
             "POST",
             "/httptest/post2",
             body,
             {"Content-MD5": md5, "Content-Type": mimetype, "Content-Disposition": "filename=test.gif"},
         )
         response = connection.getresponse()
         body2 = response.read()
     finally:
         # Release the socket even when the request or the read fails.
         connection.close()
     self.assertEqual(body, body2, "Body sent differs from body received")
Ejemplo n.º 8
0
 def test_md5_algorithms(self):
     """bibdocfile - compare md5 algorithms

     Writes a small file and checks that calculate_md5() with
     force_internal=True returns the same value as
     calculate_md5_external() for it.
     """
     filepath = os.path.join(self.path, "test.txt")
     # Close the file explicitly so the content is flushed to disk before
     # both checksums are computed, instead of relying on the handle being
     # garbage-collected in time.
     test_file = open(filepath, "w")
     try:
         test_file.write("test")
     finally:
         test_file.close()
     self.assertEqual(calculate_md5(filepath, force_internal=True), calculate_md5_external(filepath))