Code example #1
File: marcxml_tasks.py Project: mhellmic/b2share
    def _convert_record(obj, eng):
        """
        Will convert the object data, if XML, using the given stylesheet
        """
        from invenio.legacy.bibconvert.xslt_engine import convert

        eng.log.info("Starting conversion using %s stylesheet" %
                     (stylesheet,))

        try:
            obj.data = convert(obj.data, stylesheet)
        except Exception as e:
            msg = "Could not convert record: %s\n%s" % \
                  (str(e), traceback.format_exc())
            obj.extra_data["_error_msg"] = msg
            raise workflows_error.WorkflowError("Error: %s" % (msg,),
                                                id_workflow=eng.uuid,
                                                id_object=obj.id)
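
The names stylesheet, traceback and workflows_error are not defined inside _convert_record; they are resolved from an enclosing scope. A minimal sketch of the usual pattern, a factory that closes over the stylesheet name and returns the inner task callable, is shown below. The factory name and the default stylesheet file are illustrative assumptions, not taken from these repositories.

def convert_record(stylesheet="oaidc2marcxml.xsl"):
    """Return a workflow task that converts obj.data with the given XSLT stylesheet."""
    # NOTE: the factory name and default stylesheet are assumptions for illustration only.
    def _convert_record(obj, eng):
        from invenio.legacy.bibconvert.xslt_engine import convert
        obj.data = convert(obj.data, stylesheet)
    return _convert_record

# Illustrative use in a workflow definition:
# workflow = [convert_record(stylesheet="authorlist2marcxml.xsl"), ...]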
Code example #2
File: marcxml_tasks.py Project: dset0x/invenio
    def _convert_record(obj, eng):
        from invenio.modules.workflows.errors import WorkflowError
        from invenio.legacy.bibconvert.xslt_engine import convert

        eng.log.info("Starting conversion using %s stylesheet" %
                     (stylesheet,))

        if not obj.data:
            obj.log.error("Not valid conversion data!")
            raise WorkflowError("Error: conversion data missing",
                                id_workflow=eng.uuid,
                                id_object=obj.id)

        try:
            obj.data = convert(obj.data, stylesheet)
        except Exception as e:
            msg = "Could not convert record: %s\n%s" % \
                  (str(e), traceback.format_exc())
            raise WorkflowError("Error: %s" % (msg,),
                                id_workflow=eng.uuid,
                                id_object=obj.id)
Code example #3
File: marcxml_tasks.py Project: chokribr/invenio-1
    def _convert_record(obj, eng):
        from invenio.modules.workflows.errors import WorkflowError
        from invenio.legacy.bibconvert.xslt_engine import convert

        eng.log.info("Starting conversion using %s stylesheet" %
                     (stylesheet, ))

        if not obj.data:
            obj.log.error("Not valid conversion data!")
            raise WorkflowError("Error: conversion data missing",
                                id_workflow=eng.uuid,
                                id_object=obj.id)

        try:
            obj.data = convert(obj.data, stylesheet)
        except Exception as e:
            msg = "Could not convert record: %s\n%s" % \
                  (str(e), traceback.format_exc())
            raise WorkflowError("Error: %s" % (msg, ),
                                id_workflow=eng.uuid,
                                id_object=obj.id)
Code example #4
File: crossref.py Project: SCOAP3/invenio
def get_marcxml_for_doi(doi):
    """
    Send the DOI to the http://www.crossref.org/openurl page.
    Attaches the parameters: username, password, doi and noredirect.
    Returns the MARCXML code or raises an exception when:
    1. the DOI is malformed
    2. the record is not found
    """
    if not CFG_CROSSREF_USERNAME and not CFG_CROSSREF_PASSWORD:
        raise CrossrefError("error_crossref_no_account")

    # Clean the DOI
    doi = doi.strip()

    # Getting the data from external source
    url = "http://www.crossref.org/openurl/?pid=" +  CFG_CROSSREF_USERNAME \
        + ":" + CFG_CROSSREF_PASSWORD + "&noredirect=tru&id=doi:" + doi
    request = urllib2.Request(url)
    response = CROSSREF_OPENER.open(request)
    header = response.info().getheader('Content-Type')
    content = response.read()

    # Check if the returned page is html - this means the DOI is malformed
    if "text/html" in header:
        raise CrossrefError("error_crossref_malformed_doi")
    if 'status="unresolved"' in content:
        raise CrossrefError("error_crossref_record_not_found")

    # Convert the XML to MARC using the convert function
    # from the bibconvert_xslt_engine module.
    # Set the path to the XSL template
    xsl_crossref2marc_config = templates.get('crossref2marcxml.xsl', '')

    output = convert(xmltext=content,
                     template_filename=xsl_crossref2marc_config)
    return output
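
An illustrative call of get_marcxml_for_doi, assuming CFG_CROSSREF_USERNAME and CFG_CROSSREF_PASSWORD are configured and that CrossrefError is importable from the same module; the DOI below is only a placeholder.

try:
    marcxml = get_marcxml_for_doi("10.1000/xyz123")  # placeholder DOI, not a real record
except CrossrefError as err:
    # err carries one of the error codes raised above
    print("CrossRef lookup failed: %s" % err)
else:
    print(marcxml)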
Code example #5
File: marcxml_tasks.py Project: mhellmic/b2share
def author_list(obj, eng):
    """
    Perform the special authorlist extraction step (mostly INSPIRE/CERN related).

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults

    identifiers = obj.data["system_number_external"]["value"]
    bibtask.task_sleep_now_if_required()
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'], eng.uuid)
        tarball, pdf = plotextractor_getter.harvest_single(obj.data["system_number_external"]["value"], extract_path, ["tarball"])
        # Check for a failed harvest before coercing to str(); str(None) would hide it
        if tarball is None:
            raise workflows_error.WorkflowError(str("Error harvesting tarball from id: %s %s" % (identifiers, extract_path)),
                                                eng.uuid,
                                                id_object=obj.id)
        tarball = str(tarball)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"], cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            authors += match[0]

    # Generate file to store conversion results
    if authors:
        authors = convert(authors, "authorlist2marcxml.xsl")
        authorlist_record = create_records(authors)
        if len(authorlist_record) == 1:
            if authorlist_record[0][0] is None:
                eng.log.error("Error parsing authorlist record for id: %s" % (identifiers,))
            authorlist_record = authorlist_record[0][0]
        # Convert any LaTeX symbols in author names
        translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
        translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' + record_xml_output(authorlist_record) \
                      + '</collection>'
        if updated_xml is not None:
            # We store the path to the directory where the tarball contents live
            # Read and grab MARCXML from plotextractor run
            new_dict_representation = records_api.create_record(updated_xml, master_format="marc").dumps()
            obj.data['authors'] = new_dict_representation["authors"]
            obj.data['number_of_authors'] = new_dict_representation["number_of_authors"]
            obj.add_task_result("authors", new_dict_representation["authors"])
            obj.add_task_result("number_of_authors", new_dict_representation["number_of_authors"])
Code example #6
    def _author_list(obj, eng):
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(
            cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            # Use the local variable: extra_data may not yet hold the tarball path here
            eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            xml_file_fd = open(xml_file, "r")
            xml_content = xml_file_fd.read()
            xml_file_fd.close()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error(
                            "Error parsing authorlist record for id: %s" %
                            (identifiers, ))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(
                        updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }])
                    obj.update_task_results("number_of_authors", [{
                        "name":
                        "number_of_authors",
                        "results":
                        new_dict_representation["number_of_authors"]
                    }])
                    break
Code example #7
File: arxiv.py Project: jalavik/invenio-oaiharvester
    def _author_list(obj, eng):
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball,
                                      cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            # Use the local variable: extra_data may not yet hold the tarball path here
            eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            xml_file_fd = open(xml_file, "r")
            xml_content = xml_file_fd.read()
            xml_file_fd.close()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error("Error parsing authorlist record for id: %s" % (
                            identifiers,))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }]
                    )
                    obj.update_task_results(
                        "number_of_authors",
                        [{
                            "name": "number_of_authors",
                            "results": new_dict_representation["number_of_authors"]
                        }]
                    )
                    break
Code example #8
def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path,
            ["tarball"])
        # Check for a failed harvest before coercing to str(); str(None) would hide it
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)), eng.uuid, id_object=obj.id)
        tarball = str(tarball)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")

            a_stylesheet = obj.extra_data["repository"]["arguments"].get(
                "a_stylesheet"
            ) or "authorlist2marcxml.xsl"
            authors = convert(xml_content, a_stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]
            # Convert any LaTeX symbols in author names
            translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
            translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                          + record_xml_output(authorlist_record) + '</collection>'
            if updated_xml is not None:
                # We store the path to the directory where the tarball contents live
                # Read and grab MARCXML from plotextractor run
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data['authors'] = new_dict_representation["authors"]
                obj.data['number_of_authors'] = new_dict_representation[
                    "number_of_authors"]
                obj.add_task_result("authors", new_dict_representation["authors"])
                obj.add_task_result("number_of_authors",
                                    new_dict_representation["number_of_authors"])
                break
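
For reference, the conversion step shared by the author-list tasks can be exercised on its own. A minimal sketch, assuming the authorlist2marcxml.xsl stylesheet is available to Invenio's bibconvert engine; the input string is a made-up placeholder rather than real harvested data.

from invenio.legacy.bibconvert.xslt_engine import convert

# Placeholder author-list XML; a real input would come from the harvested tarball.
sample_authorlist_xml = "<collaborationauthorlist>...</collaborationauthorlist>"
marcxml = convert(sample_authorlist_xml, "authorlist2marcxml.xsl")
print(marcxml)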