def _convert_record(obj, eng):
    """Convert the object's data (assumed XML) with the configured stylesheet.

    On failure the full error message is kept on ``obj.extra_data`` for later
    inspection and a ``WorkflowError`` is raised to halt the engine.
    """
    from invenio.legacy.bibconvert.xslt_engine import convert

    eng.log.info("Starting conversion using %s stylesheet" % (stylesheet,))

    try:
        obj.data = convert(obj.data, stylesheet)
    except Exception as exc:
        msg = "Could not convert record: %s\n%s" % (
            str(exc), traceback.format_exc())
        # Persist failure details on the object before aborting the workflow.
        obj.extra_data["_error_msg"] = msg
        raise workflows_error.WorkflowError(
            "Error: %s" % (msg,),
            id_workflow=eng.uuid,
            id_object=obj.id)
def _convert_record(obj, eng):
    """Run the XSLT conversion on ``obj.data``, halting the workflow on failure."""
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.legacy.bibconvert.xslt_engine import convert

    eng.log.info("Starting conversion using %s stylesheet" % (stylesheet,))

    # Both failure paths attach the same workflow/object identifiers.
    error_ids = dict(id_workflow=eng.uuid, id_object=obj.id)

    if not obj.data:
        obj.log.error("Not valid conversion data!")
        raise WorkflowError("Error: conversion data missing", **error_ids)

    try:
        obj.data = convert(obj.data, stylesheet)
    except Exception as exc:
        details = "Could not convert record: %s\n%s" % (
            str(exc), traceback.format_exc())
        raise WorkflowError("Error: %s" % (details,), **error_ids)
def _convert_record(obj, eng):
    """Apply the configured stylesheet to the object's data.

    Raises ``WorkflowError`` when there is nothing to convert or when the
    XSLT engine fails.
    """
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.legacy.bibconvert.xslt_engine import convert

    eng.log.info("Starting conversion using %s stylesheet" % (stylesheet, ))

    if obj.data:
        try:
            obj.data = convert(obj.data, stylesheet)
        except Exception as err:
            msg = "Could not convert record: %s\n%s" % (
                str(err), traceback.format_exc())
            raise WorkflowError("Error: %s" % (msg, ),
                                id_workflow=eng.uuid,
                                id_object=obj.id)
    else:
        obj.log.error("Not valid conversion data!")
        raise WorkflowError("Error: conversion data missing",
                            id_workflow=eng.uuid,
                            id_object=obj.id)
def get_marcxml_for_doi(doi):
    """Query http://www.crossref.org/openurl for *doi* and return MARCXML.

    Attaches parameters: username, password, doi and noredirect.

    :param doi: DOI to resolve (surrounding whitespace is stripped).
    :return: the record converted to MARCXML with crossref2marcxml.xsl.
    :raises CrossrefError:
        - ``error_crossref_no_account``: no CrossRef credentials configured
        - ``error_crossref_malformed_doi``: service answered with an HTML
          page, which it does when the DOI is malformed
        - ``error_crossref_record_not_found``: DOI well-formed but unknown
    """
    if not CFG_CROSSREF_USERNAME and not CFG_CROSSREF_PASSWORD:
        raise CrossrefError("error_crossref_no_account")

    # Clean the DOI
    doi = doi.strip()

    # Getting the data from the external source.
    # BUGFIX: the query used to read "noredirect=tru"; the documented value
    # for the OpenURL noredirect parameter is "true".
    url = ("http://www.crossref.org/openurl/?pid=" + CFG_CROSSREF_USERNAME
           + ":" + CFG_CROSSREF_PASSWORD + "&noredirect=true&id=doi:" + doi)
    request = urllib2.Request(url)
    response = CROSSREF_OPENER.open(request)
    header = response.info().getheader('Content-Type')
    content = response.read()

    # An HTML answer means the DOI is malformed
    if "text/html" in header:
        raise CrossrefError("error_crossref_malformed_doi")

    if 'status="unresolved"' in content:
        raise CrossrefError("error_crossref_record_not_found")

    # Convert xml to marc using convert function
    # from bibconvert_xslt_engine file
    # Setting the path to the xsl template
    xsl_crossref2marc_config = templates.get('crossref2marcxml.xsl', '')

    output = convert(xmltext=content,
                     template_filename=xsl_crossref2marc_config)
    return output
def author_list(obj, eng):
    """Perform the special authorlist extraction step (mostly INSPIRE/CERN).

    Harvests the record's tarball, scans the extracted XML files for an
    authorlist, converts it to MARCXML and stores the resulting author data
    on the object.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults

    identifiers = obj.data["system_number_external"]["value"]
    bibtask.task_sleep_now_if_required()
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = plotextractor_getter.make_single_directory(
            cfg['CFG_TMPSHAREDDIR'], eng.uuid)
        tarball, pdf = plotextractor_getter.harvest_single(
            obj.data["system_number_external"]["value"], extract_path,
            ["tarball"])
        # BUGFIX: check for a missing tarball *before* the str() conversion;
        # str(None) == "None" is truthy, so the old order made this check
        # unreachable.
        if tarball is None:
            raise workflows_error.WorkflowError(
                str("Error harvesting tarball from id: %s %s" %
                    (identifiers, extract_path)),
                eng.uuid,
                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = str(tarball)

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' %
                      (obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    authors = ""

    for xml_file in xml_files_list:
        # Context manager closes the file even if read() raises.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:  # was: `not match == []`
            authors += match[0]

    # Generate file to store conversion results
    if authors:  # was: `authors is not ''` -- identity test against a literal
        authors = convert(authors, "authorlist2marcxml.xsl")
        authorlist_record = create_records(authors)
        if len(authorlist_record) == 1:
            if authorlist_record[0][0] is None:
                eng.log.error("Error parsing authorlist record for id: %s" %
                              (identifiers,))
            authorlist_record = authorlist_record[0][0]
            # Convert any LaTeX symbols in authornames
            translate_fieldvalues_from_latex(authorlist_record, '100',
                                             code='a')
            translate_fieldvalues_from_latex(authorlist_record, '700',
                                             code='a')

            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                + record_xml_output(authorlist_record) + '</collection>'

            if updated_xml is not None:  # was: `not None == updated_xml`
                # We store the path to the directory the tarball contents live
                # Read and grab MARCXML from plotextractor run
                new_dict_representation = records_api.create_record(
                    updated_xml, master_format="marc").dumps()
                obj.data['authors'] = new_dict_representation["authors"]
                obj.data['number_of_authors'] = \
                    new_dict_representation["number_of_authors"]
                obj.add_task_result("authors",
                                    new_dict_representation["authors"])
                obj.add_task_result(
                    "number_of_authors",
                    new_dict_representation["number_of_authors"])
def _author_list(obj, eng):
    """Extract an author list from harvested arXiv material.

    Downloads (or reuses) the record's tarball, untars it, scans the
    contained XML files for an authorlist and, when one matches, converts it
    to MARCXML and stores the author data on the object.
    """
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    from ..utils import find_matching_files

    identifiers = obj.data.get(
        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
    else:
        tarball = obj.extra_data["_result"]["tarball"]  # FIXME

    tarball = str(tarball)
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # BUGFIX: log the local `tarball`; obj.extra_data["_result"]["tarball"]
        # is never set on the fresh-download path and raised KeyError here.
        eng.log.error('Timeout during tarball extraction on %s' % (tarball))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        # Context manager closes the file even if read() raises.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors = convert(xml_content, stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error(
                        "Error parsing authorlist record for id: %s"
                        % (identifiers, ))
                authorlist_record = authorlist_record[0][0]

            author_xml = record_xml_output(authorlist_record)
            if author_xml:
                # Reuse author_xml instead of calling record_xml_output twice.
                updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                    + author_xml + '</collection>'
                new_dict_representation = convert_marcxml_to_bibfield(
                    updated_xml)
                obj.data["authors"] = new_dict_representation["authors"]
                obj.update_task_results(
                    "authors",
                    [{"name": "authors",
                      "results": new_dict_representation["authors"]}])
                obj.update_task_results(
                    "number_of_authors",
                    [{"name": "number_of_authors",
                      "results": new_dict_representation["number_of_authors"]}])
            break
def _author_list(obj, eng):
    """Extract an author list from harvested arXiv material (duplicate variant).

    Same contract as the sibling ``_author_list``: fetch/reuse the tarball,
    untar it, look for an authorlist in the extracted XML files, convert it
    and store the author data on the object.
    """
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    from ..utils import find_matching_files

    identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'),
                               "")
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
    else:
        tarball = obj.extra_data["_result"]["tarball"]  # FIXME

    tarball = str(tarball)
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # BUGFIX: log the local `tarball`; obj.extra_data["_result"]["tarball"]
        # is never set on the fresh-download path and raised KeyError here.
        eng.log.error('Timeout during tarball extraction on %s' % (tarball))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        # Context manager closes the file even if read() raises.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors = convert(xml_content, stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]

            author_xml = record_xml_output(authorlist_record)
            if author_xml:
                # Reuse author_xml instead of calling record_xml_output twice.
                updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                    + author_xml + '</collection>'
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data["authors"] = new_dict_representation["authors"]
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": new_dict_representation["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results": new_dict_representation["number_of_authors"]
                    }]
                )
            break
def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path,
            ["tarball"])
        # BUGFIX: test for None *before* the str() conversion; str(None) is
        # the truthy string "None", so the old order made this check dead.
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)),
                eng.uuid,
                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = str(tarball)

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        # Context manager closes the file even if read() raises.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            # Repository may configure its own stylesheet; fall back to the
            # default authorlist one.
            a_stylesheet = obj.extra_data["repository"]["arguments"].get(
                "a_stylesheet"
            ) or "authorlist2marcxml.xsl"
            authors = convert(xml_content, a_stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]
                # Convert any LaTeX symbols in authornames
                translate_fieldvalues_from_latex(authorlist_record, '100',
                                                 code='a')
                translate_fieldvalues_from_latex(authorlist_record, '700',
                                                 code='a')

                updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                    + record_xml_output(authorlist_record) + '</collection>'

                if updated_xml is not None:  # was: `not None == updated_xml`
                    # We store the path to the directory the tarball contents
                    # live. Read and grab MARCXML from plotextractor run
                    new_dict_representation = convert_marcxml_to_bibfield(
                        updated_xml)
                    obj.data['authors'] = new_dict_representation["authors"]
                    obj.data['number_of_authors'] = new_dict_representation[
                        "number_of_authors"]
                    obj.add_task_result("authors",
                                        new_dict_representation["authors"])
                    obj.add_task_result(
                        "number_of_authors",
                        new_dict_representation["number_of_authors"])
            break