def arxiv_refextract(obj, eng):
    """Extract references from the arXiv PDF of a workflow object.

    The PDF is looked up in ``obj.files`` under a sanitised
    ``<arxiv_id>.pdf`` name; when it is not attached yet it is fetched
    from the URL built from the ``ARXIV_PDF_URL`` config template. Any
    extracted references are stored under ``obj.data["references"]``.
    The outcome is always reported through ``obj.log``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    identifier = get_clean_arXiv_id(obj.data)
    pdf_name = secure_filename("{0}.pdf".format(identifier))

    if pdf_name in obj.files:
        # Reuse the PDF already attached to the workflow object.
        document = obj.files[pdf_name]
    else:
        pdf_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=identifier
        )
        document = download_file_to_workflow(
            workflow=obj,
            name=pdf_name,
            url=pdf_url,
        )

    if not document:
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(document.file.uri)
    if not references:
        obj.log.info("No references extracted")
        return

    obj.data["references"] = references
    obj.log.info("Extracted {0} references".format(len(references)))
def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    Uses the ``<arxiv_id>.pdf`` entry of ``obj.files`` when present and
    otherwise downloads it from the URL given by the ``ARXIV_PDF_URL``
    config template. Extracted references are kept in
    ``obj.extra_data["references"]`` and the result is logged on
    ``obj.log``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    clean_id = get_clean_arXiv_id(obj.data)
    target_name = secure_filename("{0}.pdf".format(clean_id))

    already_attached = target_name in obj.files
    if already_attached:
        pdf_file = obj.files[target_name]
    else:
        source_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=clean_id
        )
        pdf_file = download_file_to_record(
            record=obj,
            name=target_name,
            url=source_url,
        )

    if not pdf_file:
        obj.log.error("Not able to download and process the PDF")
        return

    extracted = extract_references(pdf_file.file.uri)
    if extracted:
        # FIXME For now we do not add these references to the final record.
        obj.extra_data["references"] = extracted
        summary = "Extracted {0} references".format(len(extracted))
    else:
        summary = "No references extracted"
    obj.log.info(summary)
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Resolves the model and record for ``obj``, finds (or downloads and
    attaches) the ``<arxiv_id>.pdf`` file, and runs reference extraction
    on it. On success the references are written to
    ``record["references"]``, published as a "References" task result
    and the model is updated. Every outcome is logged on ``obj.log``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    identifier = get_arxiv_id_from_record(record)
    attached = get_file_by_name(model, "{0}.pdf".format(identifier))

    if attached:
        pdf_path = attached.get_syspath()
    else:
        # Nothing attached yet: fetch the PDF and attach it to the model.
        pdf_path = get_pdf_for_model(eng, identifier)
        if pdf_path is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf_path)

    if not pdf_path or not os.path.isfile(pdf_path):
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(pdf_path)
    if not references:
        obj.log.info("No references extracted")
        return

    record["references"] = references
    obj.log.info("Extracted {0} references".format(len(references)))
    task_result = {
        "name": "References",
        "result": references,
        "template": "workflows/results/refextract.html",
    }
    obj.update_task_results("References", [task_result])
    model.update()