def submission_fulltext_download(obj, eng):
    """Fetch the PDF supplied with a submission and register it as a document.

    The link is read from ``obj.extra_data['submission_pdf']``. Nothing is
    done when that value is missing/empty or when ``is_pdf_link`` rejects it.
    Otherwise the file is fetched with ``download_file_to_workflow`` under the
    name returned by ``secure_filename('fulltext.pdf')``. On success every
    document already carrying that key is removed from
    ``obj.data['documents']`` and a fulltext document pointing at the
    workflow file is added through ``LiteratureBuilder``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :return: URI of the stored file on success, ``None`` otherwise
    """
    pdf_url = obj.extra_data.get('submission_pdf')
    if not pdf_url or not is_pdf_link(pdf_url):
        return None

    key = secure_filename('fulltext.pdf')
    fetched = download_file_to_workflow(workflow=obj, name=key, url=pdf_url)
    if not fetched:
        obj.log.info('Cannot fetch PDF provided by user from %s', pdf_url)
        return None

    # Drop every existing entry with the same key before adding the new one.
    kept = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            kept.append(document)
    obj.data['documents'] = kept

    builder = LiteratureBuilder(
        source=obj.data['acquisition_source']['source'],
        record=obj.data,
    )
    files_api_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[key].bucket_id,
        key=key,
    )
    builder.add_document(
        key,
        fulltext=True,
        original_url=pdf_url,
        url=files_api_url,
    )
    obj.data = builder.record

    obj.log.info('PDF provided by user from %s', pdf_url)
    return obj.files[key].file.uri
def test_download_file_to_workflow_retries_on_protocol_error():
    """Check that a ``ProtocolError`` on the first GET does not stop the download.

    The mocked URL raises ``ProtocolError`` on the first request and answers
    with status 200 on the second; the call is expected to return a file
    object keyed by the requested name.
    """
    pdf_key = '1605.03844.pdf'
    pdf_url = 'http://export.arxiv.org/pdf/1605.03844'

    with requests_mock.Mocker() as requests_mocker:
        fixture_path = pkg_resources.resource_filename(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))

        # Responses are served in order: first a failure, then a success.
        # NOTE(review): 'body' receives the fixture path string, not an open
        # file -- confirm this is intended.
        scripted_responses = [
            {'exc': requests.packages.urllib3.exceptions.ProtocolError},
            {'body': fixture_path, 'status_code': 200},
        ]
        requests_mocker.register_uri('GET', pdf_url, scripted_responses)

        obj = MockObj({}, {}, files=MockFiles({}))

        expected = MockFileObject(key=pdf_key)
        result = download_file_to_workflow(obj, pdf_key, pdf_url)

        assert expected == result
def download_documents(obj, eng):
    """Download each document of the record and repoint its URL.

    Iterates over ``obj.data['documents']`` (an empty list when the key is
    absent) and passes every entry's ``key`` and ``url`` to
    ``download_file_to_workflow``. When that call returns a truthy value the
    entry's ``url`` is replaced, in place, by the ``/api/files/<bucket>/<key>``
    path of the workflow file; otherwise an error is logged and the entry is
    left untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    for entry in obj.data.get('documents', []):
        key = entry['key']
        source_url = entry['url']

        fetched = download_file_to_workflow(
            workflow=obj,
            name=key,
            url=source_url,
        )
        if not fetched:
            obj.log.error('Cannot download document from %s', source_url)
            continue

        bucket = obj.files[key].bucket_id
        entry['url'] = '/api/files/{bucket}/{key}'.format(bucket=bucket, key=key)
        obj.log.info('Document downloaded from %s', source_url)
def download_documents(obj, eng):
    """Fetch the record's documents into the workflow object.

    For every entry of ``obj.data['documents']`` (nothing is done when the key
    is missing) the entry's ``url`` is handed to ``download_file_to_workflow``
    together with its ``key``. A falsy result is logged as an error; a truthy
    one causes the entry's ``url`` to be overwritten with the
    ``/api/files/<bucket>/<key>`` path built from ``obj.files``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    files_api_template = '/api/files/{bucket}/{key}'

    records = obj.data.get('documents', [])
    for record in records:
        name, remote_url = record['key'], record['url']
        outcome = download_file_to_workflow(workflow=obj, name=name, url=remote_url)

        if not outcome:
            obj.log.error('Cannot download document from %s', remote_url)
        else:
            # The entry is mutated in place, so obj.data sees the new URL.
            record['url'] = files_api_template.format(
                bucket=obj.files[name].bucket_id,
                key=name,
            )
            obj.log.info('Document downloaded from %s', remote_url)
def arxiv_package_download(obj, eng):
    """Download the arXiv tarball for the record into the workflow.

    The arXiv identifier is taken from ``obj.data`` with ``get_arxiv_id``; the
    download URL is built from the ``ARXIV_TARBALL_URL`` application setting
    and the file is stored as ``<arxiv_id>.tar.gz`` (after
    ``secure_filename``). The outcome is only logged; nothing is returned.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(arxiv_id=arxiv_id)

    fetched = download_file_to_workflow(
        workflow=obj,
        name=tarball_name,
        url=tarball_url,
    )

    if not fetched:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return

    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv PDF for the record and register it as a document.

    The URL is built from the ``ARXIV_PDF_URL`` application setting. When
    ``is_pdf_link`` rejects it, the page is fetched: if its content contains
    ``NO_PDF_ON_ARXIV`` the step ends quietly, otherwise ``DownloadError`` is
    raised. After a successful download every document already carrying the
    same key is removed from ``obj.data['documents']`` and a hidden fulltext
    ``preprint`` document is added through ``LiteratureBuilder``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :raises DownloadError: if the URL serves neither a PDF nor the
        "no PDF" marker
    """
    arxiv_id = get_arxiv_id(obj.data)
    pdf_name = secure_filename('{0}.pdf'.format(arxiv_id))
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        page_content = requests.get(pdf_url).content
        if NO_PDF_ON_ARXIV not in page_content:
            raise DownloadError("{url} is not serving a PDF file.".format(url=pdf_url))
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    fetched = download_file_to_workflow(
        workflow=obj,
        name=pdf_name,
        url=pdf_url,
    )
    if not fetched:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
        return

    # Drop every existing entry with the same key before adding the new one.
    existing = obj.data.get('documents', ())
    obj.data['documents'] = [
        entry for entry in existing if entry.get('key') != pdf_name
    ]

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    files_api_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[pdf_name].bucket_id,
        key=pdf_name,
    )
    builder.add_document(
        pdf_name,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=files_api_url,
    )
    obj.data = builder.record

    obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
def test_download_file_to_workflow_retries_on_protocol_error():
    """A ``ProtocolError`` followed by a 200 response still yields the file.

    ``requests_mock`` is set up to raise on the first GET of the arXiv PDF
    URL and to succeed on the second; ``download_file_to_workflow`` must
    return an object equal to ``MockFileObject(key='1605.03844.pdf')``.
    """
    target_url = 'http://export.arxiv.org/pdf/1605.03844'

    with requests_mock.Mocker() as requests_mocker:
        fixture = pkg_resources.resource_filename(
            __name__,
            os.path.join('fixtures', '1605.03844.pdf'),
        )

        first_attempt = {'exc': requests.packages.urllib3.exceptions.ProtocolError}
        second_attempt = {'body': fixture, 'status_code': 200}
        requests_mocker.register_uri(
            'GET',
            target_url,
            [first_attempt, second_attempt],
        )

        workflow_files = MockFiles({})
        workflow_obj = MockObj({}, {}, files=workflow_files)

        expected = MockFileObject(key='1605.03844.pdf')
        result = download_file_to_workflow(
            workflow_obj,
            '1605.03844.pdf',
            target_url,
        )

        assert expected == result