def populate_arxiv_document(obj, eng):
    """Register the arXiv PDF of a record as a remote document.

    The URL templates stored under the ``ARXIV_PDF_URL`` and
    ``ARXIV_PDF_URL_ALTERNATIVE`` configuration keys are tried in that
    order. The first one for which ``is_pdf_link`` succeeds is added to
    ``obj.data['documents']`` (replacing any entry with the same key) as a
    hidden fulltext preprint whose ``url`` and ``original_url`` both point
    at that link.

    If a candidate is not a PDF link and its page contains the
    ``NO_PDF_ON_ARXIV`` marker, this is logged and the function returns
    immediately, leaving the record untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :raises DownloadError: if fetching a candidate page fails, or if none
        of the candidate URLs serves a PDF file.
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for config_key in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[config_key].format(arxiv_id=arxiv_id)
        if is_pdf_link(url):
            break

        # Not a PDF: inspect the page to tell "arXiv has no PDF for this
        # record" apart from "this URL is simply not usable".
        try:
            page_body = requests.get(url).content
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))

        if NO_PDF_ON_ARXIV in page_body:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
    else:
        # Every candidate was tried and none served a PDF; ``url`` still
        # holds the last one attempted.
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))

    # Drop any document previously stored under the same key so the entry
    # added below does not create a duplicate.
    remaining_documents = []
    for existing in obj.data.get('documents', ()):
        if existing.get('key') != filename:
            remaining_documents.append(existing)
    obj.data['documents'] = remaining_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = builder.record
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv PDF of a record and attach it as a document.

    The URL is built from the ``ARXIV_PDF_URL`` configuration template.
    When it is a PDF link, the file is fetched with
    ``download_file_to_workflow`` and, on success, added to
    ``obj.data['documents']`` (replacing any entry with the same key) as a
    hidden fulltext preprint pointing at ``/api/files/<bucket>/<key>``.

    When the URL is not a PDF link and its page contains the
    ``NO_PDF_ON_ARXIV`` marker, this is logged and nothing else happens.
    A failed download is logged as an error without raising.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :raises DownloadError: if the URL is not a PDF link and its page does
        not contain the ``NO_PDF_ON_ARXIV`` marker.
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(url):
        page_body = requests.get(url).content
        if NO_PDF_ON_ARXIV not in page_body:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=url)
            )
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    pdf = download_file_to_workflow(
        workflow=obj,
        name=filename,
        url=url,
    )
    if not pdf:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
        return

    # Drop any document previously stored under the same key so the entry
    # added below does not create a duplicate.
    remaining_documents = []
    for existing in obj.data.get('documents', ()):
        if existing.get('key') != filename:
            remaining_documents.append(existing)
    obj.data['documents'] = remaining_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    # The document links to the copy stored with the workflow object, while
    # ``original_url`` keeps the arXiv address it was fetched from.
    stored_file_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[filename].bucket_id,
        key=filename,
    )
    builder.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=stored_file_url,
    )
    obj.data = builder.record
    obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)