Python get_pdf_from_arxivの例、invenio.utils.plotextractor.api.get_pdf_from_arxiv Pythonの例

コード例 #1

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jma/inspire-next

def get_pdf_for_model(eng, arxiv_id):
    """Download the PDF for an arXiv identifier into a per-engine folder.

    The target folder is the configured storage directory
    (``OAIHARVESTER_STORAGEDIR``, falling back to ``CFG_TMPSHAREDDIR``)
    joined with the string form of ``eng.uuid``.

    :param eng: workflow engine; only its ``uuid`` attribute is read
    :param arxiv_id: identifier passed through to ``get_pdf_from_arxiv``
    :return: the value returned by ``get_pdf_from_arxiv``
    """
    fallback_dir = cfg.get('CFG_TMPSHAREDDIR')
    storage_root = cfg.get('OAIHARVESTER_STORAGEDIR', fallback_dir)
    target_dir = os.path.join(storage_root, str(eng.uuid))
    return get_pdf_from_arxiv(arxiv_id, target_dir)

コード例 #2

0

ファイルを表示

def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    The path of the downloaded PDF is cached under
    ``obj.extra_data["_result"]["pdf"]``. When that entry already exists the
    task only logs a notice and returns without downloading again.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises KeyError: if ``obj.extra_data["repository"]["arguments"]`` is
        missing when a download is attempted
    """
    # The cache lives under "_result". Only create it when that exact key is
    # missing, otherwise results stored by earlier tasks would be wiped and
    # the "already downloaded" branch below could never be reached.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    # Imported lazily: only needed when a download is actually attempted.
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv

    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid))
    pdf = get_pdf_from_arxiv(
        obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
        extract_path)

    # Resolve the document type: an explicit non-empty ``t_doctype`` wins,
    # anything else falls back to 'arXiv'.
    arguments = obj.extra_data["repository"]["arguments"]
    try:
        if arguments['t_doctype'] != '':
            doctype = arguments['t_doctype']
        else:
            doctype = 'arXiv'
    except KeyError:
        eng.log.error("WARNING: HASARDOUS BEHAVIOUR EXPECTED, "
                      "You didn't specified t_doctype in argument"
                      " for fulltext_download,"
                      "try to recover by using the default one!")
        doctype = 'arXiv'

    if not pdf:
        obj.log.info("No PDF found.")
        return

    # Remember the file so later tasks (and re-runs) can reuse it.
    obj.extra_data["_result"]["pdf"] = pdf
    new_dict_representation = {
        "fft": [{
            "url": pdf,
            "docfile_type": doctype
        }]
    }
    _attach_files_to_obj(obj, new_dict_representation)
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        "PDF", [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/fft.html"
        }])

コード例 #3

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jalavik/invenio-oaiharvester

def arxiv_refextract(obj, eng):
    """Extract references from the record's arXiv PDF and store them.

    A PDF path cached in ``obj.extra_data["_result"]["pdf"]`` is reused when
    present and truthy; otherwise the PDF is downloaded and the result is
    cached there. Extracted references are written to
    ``obj.data["reference"]`` and published as task results.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    results = obj.extra_data.setdefault("_result", {})

    pdf = results.get("pdf")
    if not pdf:
        storage_root = cfg.get('OAIHARVESTER_STORAGEDIR',
                               cfg.get('CFG_TMPSHAREDDIR'))
        arxiv_id = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'))
        pdf = get_pdf_from_arxiv(
            arxiv_id, os.path.join(storage_root, str(eng.uuid)))
        # Cached even when the download yielded nothing.
        results["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references extracted")
        return

    # Wrap the extracted fragment so it forms a complete MARCXML collection.
    marcxml = ('<?xml version="1.0" encoding="UTF-8"?>\n'
               '<collection>\n' + references_xml + "\n</collection>")
    bibfield = convert_marcxml_to_bibfield(marcxml)
    if "reference" not in bibfield:
        return

    obj.data["reference"] = bibfield["reference"]
    reference_count = len(obj.data["reference"])
    obj.log.info("Extracted {0} references".format(reference_count))
    task_result = {
        "name": "References",
        "result": bibfield['reference'],
        "template": "workflows/results/refextract.html",
    }
    obj.update_task_results("References", [task_result])

コード例 #4

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jmartinm/invenio-oaiharvester

def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF belonging to ``obj``.

    The PDF location is taken from ``obj.extra_data["_result"]["pdf"]`` when
    that entry is truthy; otherwise the file is fetched with
    ``get_pdf_from_arxiv`` and the outcome is stored under the same key.
    Any references found end up in ``obj.data["reference"]`` and in the
    task results of ``obj``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    extra = obj.extra_data
    if "_result" not in extra:
        extra["_result"] = {}
    cache = extra["_result"]

    pdf = cache["pdf"] if "pdf" in cache else None

    if not pdf:
        download_dir = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid),
        )
        lookup_key = cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')
        pdf = get_pdf_from_arxiv(obj.data.get(lookup_key), download_dir)
        # The outcome is stored even when no file was obtained.
        cache["pdf"] = pdf

    have_file = pdf and os.path.isfile(pdf)
    if not have_file:
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references extracted")
        return

    # The extractor output is a fragment; give it an XML prolog and a
    # <collection> root before conversion.
    prolog = '<?xml version="1.0" encoding="UTF-8"?>\n' '<collection>\n'
    converted = convert_marcxml_to_bibfield(
        prolog + references_xml + "\n</collection>")

    if "reference" in converted:
        obj.data["reference"] = converted["reference"]
        obj.log.info(
            "Extracted {0} references".format(len(obj.data["reference"])))
        obj.update_task_results(
            "References",
            [dict([
                ("name", "References"),
                ("result", converted['reference']),
                ("template", "workflows/results/refextract.html"),
            ])],
        )

コード例 #5

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jalavik/invenio-oaiharvester

    def _arxiv_fulltext_download(obj, eng):
        """Perform the fulltext download step for arXiv records.

        The path of the downloaded PDF is cached under
        ``obj.extra_data["_result"]["pdf"]``. When that entry already exists
        the task only logs a notice and returns without downloading again.

        NOTE(review): ``doctype`` is not defined in this function; it is
        presumably supplied by the enclosing scope -- confirm.

        :param obj: Bibworkflow Object to process
        :param eng: BibWorkflowEngine processing the object
        """
        # The cache lives under "_result". Only create it when that exact key
        # is missing, otherwise results stored by earlier tasks would be
        # wiped and the "already downloaded" branch could never be reached.
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        if "pdf" in obj.extra_data["_result"]:
            eng.log.info("There was already a pdf register for this record,"
                         "perhaps a duplicate task in you workflow.")
            return

        # Imported lazily: only needed when a download is actually attempted.
        from invenio.utils.plotextractor.api import get_pdf_from_arxiv

        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )

        if not pdf:
            obj.log.info("No PDF found.")
            return

        # Remember the file so later tasks (and re-runs) can reuse it.
        obj.extra_data["_result"]["pdf"] = pdf
        new_dict_representation = {
            "fft": [
                {
                    "url": pdf,
                    "docfile_type": doctype
                }
            ]
        }
        _attach_files_to_obj(obj, new_dict_representation)
        fileinfo = {
            "type": "fulltext",
            "filename": os.path.basename(pdf),
            "full_path": pdf,
        }
        # Task results are keyed by the PDF's file name.
        obj.update_task_results(
            os.path.basename(pdf),
            [{
                "name": "PDF",
                "result": fileinfo,
                "template": "workflows/results/files.html"
            }]
        )

コード例 #6

0

ファイルを表示

ファイル: arxiv.py プロジェクト: jmartinm/invenio-oaiharvester

    def _arxiv_fulltext_download(obj, eng):
        """Perform the fulltext download step for arXiv records.

        The path of the downloaded PDF is cached under
        ``obj.extra_data["_result"]["pdf"]``. When that entry already exists
        the task only logs a notice and returns without downloading again.

        NOTE(review): ``doctype`` is not defined in this function; it is
        presumably supplied by the enclosing scope -- confirm.

        :param obj: Bibworkflow Object to process
        :param eng: BibWorkflowEngine processing the object
        """
        # The cache lives under "_result". Only create it when that exact key
        # is missing, otherwise results stored by earlier tasks would be
        # wiped and the "already downloaded" branch could never be reached.
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        if "pdf" in obj.extra_data["_result"]:
            eng.log.info("There was already a pdf register for this record,"
                         "perhaps a duplicate task in you workflow.")
            return

        # Imported lazily: only needed when a download is actually attempted.
        from invenio.utils.plotextractor.api import get_pdf_from_arxiv

        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR',
                    cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)

        if not pdf:
            obj.log.info("No PDF found.")
            return

        # Remember the file so later tasks (and re-runs) can reuse it.
        obj.extra_data["_result"]["pdf"] = pdf
        new_dict_representation = {
            "fft": [{
                "url": pdf,
                "docfile_type": doctype
            }]
        }
        _attach_files_to_obj(obj, new_dict_representation)
        fileinfo = {
            "type": "fulltext",
            "filename": os.path.basename(pdf),
            "full_path": pdf,
        }
        # Task results are keyed by the PDF's file name.
        obj.update_task_results(
            os.path.basename(pdf),
            [{
                "name": "PDF",
                "result": fileinfo,
                "template": "workflows/results/files.html"
            }])

コード例 #7

0

ファイルを表示

ファイル: postprocess.py プロジェクト: jirikuncar/invenio-oaiharvester

def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    The path of the downloaded PDF is cached under
    ``obj.extra_data["_result"]["pdf"]``. When that entry already exists the
    task only logs a notice and returns without downloading again.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises KeyError: if ``obj.extra_data["repository"]["arguments"]`` is
        missing when a download is attempted
    """
    # The cache lives under "_result". Only create it when that exact key is
    # missing, otherwise results stored by earlier tasks would be wiped and
    # the "already downloaded" branch below could never be reached.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    # Imported lazily: only needed when a download is actually attempted.
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv

    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid)
    )
    pdf = get_pdf_from_arxiv(
        obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
        extract_path
    )

    # Resolve the document type: an explicit non-empty ``t_doctype`` wins,
    # anything else falls back to 'arXiv'.
    arguments = obj.extra_data["repository"]["arguments"]
    try:
        if arguments['t_doctype'] != '':
            doctype = arguments['t_doctype']
        else:
            doctype = 'arXiv'
    except KeyError:
        eng.log.error("WARNING: HASARDOUS BEHAVIOUR EXPECTED, "
                      "You didn't specified t_doctype in argument"
                      " for fulltext_download,"
                      "try to recover by using the default one!")
        doctype = 'arXiv'

    if not pdf:
        obj.log.info("No PDF found.")
        return

    # Remember the file so later tasks (and re-runs) can reuse it.
    obj.extra_data["_result"]["pdf"] = pdf
    new_dict_representation = {
        "fft": [
            {
                "url": pdf,
                "docfile_type": doctype
            }
        ]
    }
    _attach_files_to_obj(obj, new_dict_representation)
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        "PDF",
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/fft.html"
        }]
    )