def test_download_external_url(self):
        """Check that EXAMPLE_PDF_URL_1 can be downloaded as a PDF.

        The file is fetched into a scratch path created with mkstemp.  An
        InvenioFileDownloadError raised by the download is reported as a
        test failure carrying the error text.  The scratch file is removed
        whatever the outcome.
        """
        from invenio.filedownloadutils import (download_external_url,
                                               InvenioFileDownloadError)

        temp_fd, temp_path = mkstemp()
        # Only the path is needed; close the descriptor mkstemp opened.
        os.close(temp_fd)
        try:
            download_external_url(EXAMPLE_PDF_URL_1,
                                  temp_path,
                                  content_type='pdf')
        except InvenioFileDownloadError as e:
            # "except X as e" parses on Python 2.6+ and Python 3; the older
            # "except X, e" spelling is a syntax error on Python 3.
            self.fail(str(e))
        finally:
            os.unlink(temp_path)
    def test_download_external_url(self):
        """Check that EXAMPLE_PDF_URL_1 can be downloaded as a PDF.

        The file is fetched into a scratch path created with mkstemp.  An
        InvenioFileDownloadError raised by the download is reported as a
        test failure carrying the error text.  The scratch file is removed
        whatever the outcome.
        """
        from invenio.filedownloadutils import (download_external_url,
                                               InvenioFileDownloadError)

        temp_fd, temp_path = mkstemp()
        # Only the path is needed; close the descriptor mkstemp opened.
        os.close(temp_fd)
        try:
            download_external_url(EXAMPLE_PDF_URL_1,
                                  temp_path,
                                  content_type='pdf')
        except InvenioFileDownloadError as e:
            # "except X as e" parses on Python 2.6+ and Python 3; the older
            # "except X, e" spelling is a syntax error on Python 3.
            self.fail(str(e))
        finally:
            os.unlink(temp_path)
 def test_download_external_url_invalid_content_type(self):
     """Requesting CFG_SITE_URL with content_type='pdf' must be rejected.

     The test passes only when download_external_url raises
     InvenioFileDownloadError.  The scratch file created with mkstemp is
     removed whatever the outcome.
     """
     from invenio.filedownloadutils import (InvenioFileDownloadError,
                                            download_external_url)
     from invenio.config import CFG_SITE_URL
     descriptor, scratch_path = mkstemp()
     # Only the path is used; release the descriptor mkstemp opened.
     os.close(descriptor)
     try:
         try:
             download_external_url(CFG_SITE_URL, scratch_path,
                                   content_type='pdf')
         except InvenioFileDownloadError:
             # Expected outcome: the download was refused.
             pass
         else:
             # The call returned normally, so nothing was rejected.
             self.fail()
     finally:
         os.unlink(scratch_path)
 def test_download_external_url_invalid_content_type(self):
     """Requesting CFG_SITE_URL with content_type='pdf' must be rejected.

     The test passes only when download_external_url raises
     InvenioFileDownloadError.  The scratch file created with mkstemp is
     removed whatever the outcome.
     """
     from invenio.filedownloadutils import (InvenioFileDownloadError,
                                            download_external_url)
     from invenio.config import CFG_SITE_URL
     descriptor, scratch_path = mkstemp()
     # Only the path is used; release the descriptor mkstemp opened.
     os.close(descriptor)
     try:
         try:
             download_external_url(CFG_SITE_URL, scratch_path,
                                   content_type='pdf')
         except InvenioFileDownloadError:
             # Expected outcome: the download was refused.
             pass
         else:
             # The call returned normally, so nothing was rejected.
             self.fail()
     finally:
         os.unlink(scratch_path)
Beispiel #5
0
def download_one(recid, version):
    """Fetch one version of a record's arXiv PDF and attach it if it changed.

    Only the first identifier yielded by extract_arxiv_ids_from_recid(recid)
    is processed; for each further identifier a warning is logged and
    nothing else happens.  If no identifier is yielded at all, the function
    returns after the initial log message.

    The PDF located by build_arxiv_url(arxiv_id, version) is downloaded into
    a NamedTemporaryFile created under CFG_TMPSHAREDDIR.  A download smaller
    than 25000 bytes is scanned for the text 'PDF unavailable'.  The
    download is then compared by MD5 with the first entry of
    BibRecDocs(recid).list_latest_files(doctype="arXiv"):

    - no such entry: the download is added as a new "arXiv" file;
    - MD5 differs: the download is added as a new version of that document;
    - MD5 matches: FoundExistingPdf is raised.

    Raises PdfNotAvailable when a small download contains
    'PDF unavailable', and FoundExistingPdf when the stored file already has
    the same MD5 as the download.
    """
    write_message('fetching %s' % recid)
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if position > 0:
            # Only the first identifier is handled; extras are just reported.
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        # Slashes in the identifier are replaced so it can be part of a
        # file name and of the document name.
        safe_arxiv_id = arxiv_id.replace('/', '_')
        # The reference is held for the rest of the iteration because
        # NamedTemporaryFile removes the file once the object is closed.
        # NOTE(review): it is never closed explicitly here -- cleanup
        # presumably relies on garbage collection; confirm that is intended.
        scratch = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                     suffix="%s.pdf" % safe_arxiv_id,
                                     dir=CFG_TMPSHAREDDIR)
        write_message('downloading pdf from %s' % pdf_url)
        pdf_path = download_external_url(pdf_url, scratch.name,
                                         content_type='pdf')

        # A small download may be an HTML "not found" page, not a real PDF.
        if os.path.getsize(pdf_path) < 25000:
            handle = open(pdf_path)
            try:
                for text_line in handle:
                    if 'PDF unavailable' in text_line:
                        raise PdfNotAvailable()
            finally:
                handle.close()

        record_docs = BibRecDocs(recid)
        latest_files = record_docs.list_latest_files(doctype="arXiv")

        if not latest_files:
            # Nothing stored under the "arXiv" doctype yet.
            write_message('adding as new file')
            record_docs.add_new_file(pdf_path,
                                     doctype="arXiv",
                                     docname="arXiv:%s" % safe_arxiv_id)
        else:
            current = latest_files[0]
            existing_md5 = calculate_md5(current.fullpath)
            new_md5 = calculate_md5(pdf_path.encode('utf-8'))
            if new_md5 == existing_md5:
                write_message('md5 matches existing pdf, skipping')
                raise FoundExistingPdf()

            write_message('md5 differs updating')
            write_message('adding as new version')
            record_docs.add_new_version(pdf_path, docname=current.name)
def download_one(recid, version):
    """Fetch one version of a record's arXiv PDF and attach it if it changed.

    Only the first identifier yielded by extract_arxiv_ids_from_recid(recid)
    is processed; for each further identifier a warning is logged and
    nothing else happens.  If no identifier is yielded at all, the function
    returns after the initial log message.

    The PDF located by build_arxiv_url(arxiv_id, version) is downloaded into
    a NamedTemporaryFile created under CFG_TMPSHAREDDIR.  A download smaller
    than 25000 bytes is scanned for the text 'PDF unavailable'.  The
    download is then compared by MD5 with the first entry of
    BibRecDocs(recid).list_latest_files(doctype="arXiv"):

    - no such entry: the download is added as a new "arXiv" file;
    - MD5 differs: the download is added as a new version of that document;
    - MD5 matches: FoundExistingPdf is raised.

    Raises PdfNotAvailable when a small download contains
    'PDF unavailable', and FoundExistingPdf when the stored file already has
    the same MD5 as the download.
    """
    write_message('fetching %s' % recid)
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if position > 0:
            # Only the first identifier is handled; extras are just reported.
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        # Slashes in the identifier are replaced so it can be part of a
        # file name and of the document name.
        safe_arxiv_id = arxiv_id.replace('/', '_')
        # The reference is held for the rest of the iteration because
        # NamedTemporaryFile removes the file once the object is closed.
        # NOTE(review): it is never closed explicitly here -- cleanup
        # presumably relies on garbage collection; confirm that is intended.
        scratch = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                     suffix="%s.pdf" % safe_arxiv_id,
                                     dir=CFG_TMPSHAREDDIR)
        write_message('downloading pdf from %s' % pdf_url)
        pdf_path = download_external_url(pdf_url, scratch.name,
                                         content_type='pdf')

        # A small download may be an HTML "not found" page, not a real PDF.
        if os.path.getsize(pdf_path) < 25000:
            handle = open(pdf_path)
            try:
                for text_line in handle:
                    if 'PDF unavailable' in text_line:
                        raise PdfNotAvailable()
            finally:
                handle.close()

        record_docs = BibRecDocs(recid)
        latest_files = record_docs.list_latest_files(doctype="arXiv")

        if not latest_files:
            # Nothing stored under the "arXiv" doctype yet.
            write_message('adding as new file')
            record_docs.add_new_file(pdf_path,
                                     doctype="arXiv",
                                     docname="arXiv:%s" % safe_arxiv_id)
        else:
            current = latest_files[0]
            existing_md5 = calculate_md5(current.fullpath)
            new_md5 = calculate_md5(pdf_path.encode('utf-8'))
            if new_md5 == existing_md5:
                write_message('md5 matches existing pdf, skipping')
                raise FoundExistingPdf()

            write_message('md5 differs updating')
            write_message('adding as new version')
            record_docs.add_new_version(pdf_path, docname=current.name)