def get_pdf_for_model(eng, arxiv_id):
    """Fetch the PDF of an arXiv record into this engine's storage folder.

    The target folder is ``<storage root>/<engine uuid>``, where the storage
    root is the ``OAIHARVESTER_STORAGEDIR`` setting, falling back to
    ``CFG_TMPSHAREDDIR`` when the former is not configured.

    :param eng: engine object whose ``uuid`` names the target folder
    :param arxiv_id: identifier handed to ``get_pdf_from_arxiv``
    :return: whatever ``get_pdf_from_arxiv`` returns for that identifier
    """
    fallback_root = cfg.get('CFG_TMPSHAREDDIR')
    storage_root = cfg.get('OAIHARVESTER_STORAGEDIR', fallback_root)
    target_dir = os.path.join(storage_root, str(eng.uuid))
    return get_pdf_from_arxiv(arxiv_id, target_dir)
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    Downloads the PDF of the record, attaches it to the object as an FFT
    entry and publishes a "PDF" task result. The downloaded path is cached
    in ``obj.extra_data["_result"]["pdf"]``; when that key is already
    present nothing is downloaded and an informational message is logged
    on the engine instead.

    The FFT document type is read from
    ``obj.extra_data["repository"]["arguments"]["t_doctype"]`` and defaults
    to ``'arXiv'`` when the argument is empty or missing.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    # The cache lives under "_result". Guarding on any other key would
    # reset it on every call and throw away a previously stored PDF path.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    # Imported here because it is only needed when a download happens.
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv

    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid)
    )
    pdf = get_pdf_from_arxiv(
        obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
        extract_path
    )

    # Resolve the document type before looking at the download result so a
    # missing ``t_doctype`` argument is reported even when no PDF was found.
    arguments = obj.extra_data["repository"]["arguments"]
    try:
        doctype = arguments['t_doctype']
    except KeyError:
        eng.log.error("WARNING: HASARDOUS BEHAVIOUR EXPECTED, "
                      "You didn't specified t_doctype in argument"
                      " for fulltext_download,"
                      "try to recover by using the default one!")
        doctype = 'arXiv'
    else:
        if doctype == '':
            doctype = 'arXiv'

    if not pdf:
        obj.log.info("No PDF found.")
        return

    obj.extra_data["_result"]["pdf"] = pdf
    new_dict_representation = {
        "fft": [
            {
                "url": pdf,
                "docfile_type": doctype
            }
        ]
    }
    _attach_files_to_obj(obj, new_dict_representation)
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        "PDF",
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/fft.html"
        }]
    )
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Uses the PDF path cached in ``obj.extra_data["_result"]["pdf"]`` or,
    when none is cached, downloads the PDF first. References extracted from
    the PDF are stored in ``obj.data["reference"]`` and published as a
    "References" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")
    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if pdf:
            # Cache successful downloads only: a stored empty value would
            # make a plain ``"pdf" in ...`` membership test elsewhere treat
            # the record as already downloaded.
            obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if references_xml:
        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                      '<collection>\n' + references_xml + \
                      "\n</collection>"
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        if "reference" in new_dict_representation:
            obj.data["reference"] = new_dict_representation["reference"]
            obj.log.info("Extracted {0} references".format(
                len(obj.data["reference"])
            ))
            obj.update_task_results(
                "References",
                [{"name": "References",
                  "result": new_dict_representation['reference'],
                  "template": "workflows/results/refextract.html"}]
            )
            return

    # Reached when the extractor produced no XML, or XML without any
    # "reference" field; both cases are reported.
    obj.log.info("No references extracted")
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Uses the PDF path cached in ``obj.extra_data["_result"]["pdf"]`` or,
    when none is cached, downloads the PDF first. References extracted from
    the PDF are stored in ``obj.data["reference"]`` and published as a
    "References" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")
    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if pdf:
            # Cache successful downloads only: a stored empty value would
            # make a plain ``"pdf" in ...`` membership test elsewhere treat
            # the record as already downloaded.
            obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if references_xml:
        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                      '<collection>\n' + references_xml + \
                      "\n</collection>"
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        if "reference" in new_dict_representation:
            obj.data["reference"] = new_dict_representation["reference"]
            obj.log.info("Extracted {0} references".format(
                len(obj.data["reference"])))
            obj.update_task_results(
                "References", [{
                    "name": "References",
                    "result": new_dict_representation['reference'],
                    "template": "workflows/results/refextract.html"
                }])
            return

    # Reached when the extractor produced no XML, or XML without any
    # "reference" field; both cases are reported.
    obj.log.info("No references extracted")
def _arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    Downloads the PDF of the record, attaches it to the object as an FFT
    entry and publishes a task result named after the PDF file. The
    downloaded path is cached in ``obj.extra_data["_result"]["pdf"]``; when
    that key is already present nothing is downloaded and an informational
    message is logged on the engine instead.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv

    # The cache lives under "_result". Guarding on any other key would
    # reset it on every call and throw away a previously stored PDF path.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid)
    )
    pdf = get_pdf_from_arxiv(
        obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
        extract_path
    )

    if not pdf:
        obj.log.info("No PDF found.")
        return

    obj.extra_data["_result"]["pdf"] = pdf
    # NOTE(review): ``doctype`` is not bound inside this function; it is
    # presumably supplied by an enclosing or module scope -- confirm.
    new_dict_representation = {
        "fft": [
            {
                "url": pdf,
                "docfile_type": doctype
            }
        ]
    }
    _attach_files_to_obj(obj, new_dict_representation)
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        os.path.basename(pdf),
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/files.html"
        }]
    )
def _arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    Downloads the PDF of the record, attaches it to the object as an FFT
    entry and publishes a task result named after the PDF file. The
    downloaded path is cached in ``obj.extra_data["_result"]["pdf"]``; when
    that key is already present nothing is downloaded and an informational
    message is logged on the engine instead.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv

    # The cache lives under "_result". Guarding on any other key would
    # reset it on every call and throw away a previously stored PDF path.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid))
    pdf = get_pdf_from_arxiv(
        obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
        extract_path)

    if not pdf:
        obj.log.info("No PDF found.")
        return

    obj.extra_data["_result"]["pdf"] = pdf
    # NOTE(review): ``doctype`` is not bound inside this function; it is
    # presumably supplied by an enclosing or module scope -- confirm.
    new_dict_representation = {
        "fft": [{
            "url": pdf,
            "docfile_type": doctype
        }]
    }
    _attach_files_to_obj(obj, new_dict_representation)
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        os.path.basename(pdf), [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/files.html"
        }])
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    Downloads the PDF of the record, attaches it to the object as an FFT
    entry and publishes a "PDF" task result. The downloaded path is cached
    in ``obj.extra_data["_result"]["pdf"]``; when that key is already
    present nothing is downloaded and an informational message is logged
    on the engine instead.

    The FFT document type is read from
    ``obj.extra_data["repository"]["arguments"]["t_doctype"]`` and defaults
    to ``'arXiv'`` when the argument is empty or missing.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    # The cache lives under "_result". Guarding on any other key would
    # reset it on every call and throw away a previously stored PDF path.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    # Imported here because it is only needed when a download happens.
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv

    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid)
    )
    pdf = get_pdf_from_arxiv(
        obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
        extract_path
    )

    # Resolve the document type before looking at the download result so a
    # missing ``t_doctype`` argument is reported even when no PDF was found.
    arguments = obj.extra_data["repository"]["arguments"]
    try:
        doctype = arguments['t_doctype']
    except KeyError:
        eng.log.error("WARNING: HASARDOUS BEHAVIOUR EXPECTED, "
                      "You didn't specified t_doctype in argument"
                      " for fulltext_download,"
                      "try to recover by using the default one!")
        doctype = 'arXiv'
    else:
        if doctype == '':
            doctype = 'arXiv'

    if not pdf:
        obj.log.info("No PDF found.")
        return

    obj.extra_data["_result"]["pdf"] = pdf
    new_dict_representation = {
        "fft": [
            {
                "url": pdf,
                "docfile_type": doctype
            }
        ]
    }
    _attach_files_to_obj(obj, new_dict_representation)
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        "PDF",
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/fft.html"
        }]
    )