Ejemplo n.º 1
0
def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    The PDF is looked up in ``obj.files`` under a sanitised
    ``<arxiv id>.pdf`` name; when it is not attached yet it is downloaded
    from the URL template configured as ``ARXIV_PDF_URL``. Extracted
    references are stored under ``obj.data["references"]``. Every outcome
    is reported through ``obj.log``; nothing is returned.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    identifier = get_clean_arXiv_id(obj.data)
    pdf_name = secure_filename("{0}.pdf".format(identifier))

    # Reuse an already attached PDF, otherwise fetch it.
    if pdf_name in obj.files:
        document = obj.files[pdf_name]
    else:
        document = download_file_to_workflow(
            workflow=obj,
            name=pdf_name,
            url=current_app.config['ARXIV_PDF_URL'].format(
                arxiv_id=identifier
            ),
        )

    if not document:
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(document.file.uri)
    if not references:
        obj.log.info("No references extracted")
        return

    obj.data["references"] = references
    reference_count = len(references)
    obj.log.info("Extracted {0} references".format(reference_count))
Ejemplo n.º 2
0
def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    The PDF is looked up in ``obj.files`` under a sanitised
    ``<arxiv id>.pdf`` name; when it is not attached yet it is downloaded
    from the URL template configured as ``ARXIV_PDF_URL``. Extracted
    references are kept in ``obj.extra_data["references"]`` rather than in
    the record data itself. Every outcome is reported through ``obj.log``;
    nothing is returned.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    """
    identifier = get_clean_arXiv_id(obj.data)
    pdf_name = secure_filename("{0}.pdf".format(identifier))

    # Reuse an already attached PDF, otherwise fetch it.
    if pdf_name in obj.files:
        document = obj.files[pdf_name]
    else:
        document = download_file_to_record(
            record=obj,
            name=pdf_name,
            url=current_app.config['ARXIV_PDF_URL'].format(
                arxiv_id=identifier
            ),
        )

    if not document:
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(document.file.uri)
    if not references:
        obj.log.info("No references extracted")
        return

    # FIXME For now we do not add these references to the final record.
    obj.extra_data["references"] = references
    reference_count = len(references)
    obj.log.info("Extracted {0} references".format(reference_count))
Ejemplo n.º 3
0
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Resolves the model and record for ``obj``, locates (or downloads and
    attaches) the ``<arxiv id>.pdf`` file, and runs reference extraction on
    it. On success the references are written to ``record["references"]``,
    published as a "References" task result, and the model is updated.
    Every outcome is reported through ``obj.log``; nothing is returned.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    attached = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

    if attached:
        pdf_path = attached.get_syspath()
    else:
        # Nothing attached yet: fetch the PDF and attach it to the model.
        pdf_path = get_pdf_for_model(eng, arxiv_id)
        if pdf_path is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf_path)

    if not (pdf_path and os.path.isfile(pdf_path)):
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(pdf_path)
    if not references:
        obj.log.info("No references extracted")
        return

    record["references"] = references
    obj.log.info("Extracted {0} references".format(len(references)))

    task_result = {
        "name": "References",
        "result": references,
        "template": "workflows/results/refextract.html",
    }
    obj.update_task_results("References", [task_result])
    model.update()