Exemple #1
0
def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
    """
    Extract structured references from a "heavy" intermediate bundle.

    Current behavior is to return *both* fatcat refs and GROBID refs if
    available: refs from the primary (biblio) release metadata come first,
    followed by refs parsed out of the GROBID TEI-XML.

    Returns an empty list for any bundle whose doc_type is not
    ``DocType.work``.

    If the GROBID TEI-XML can not be parsed, the GROBID refs are skipped
    (best-effort) and any refs already collected from release metadata are
    still returned.

    Raises ``IndexError`` if the release identified by
    ``heavy.biblio_release_ident`` (or by the GROBID ``release_ident``) is not
    present in ``heavy.releases``.
    """
    # function-scope import: needed only for the ParseError type below
    import xml.etree.ElementTree

    if heavy.doc_type != DocType.work:
        return []

    # first, identify source of refs: fatcat release metadata or GROBID
    assert heavy.biblio_release_ident
    primary_release = [
        r for r in heavy.releases if r.ident == heavy.biblio_release_ident
    ][0]

    refs: List[RefStructured] = []

    if primary_release.refs:
        # TODO: what about other releases?
        refs.extend(refs_from_release_refs(primary_release))

    if heavy.grobid_fulltext:
        fulltext_release = [
            r for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        try:
            tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
        except xml.etree.ElementTree.ParseError:
            # malformed TEI-XML should not discard the release refs above
            tei_dict = None
        if tei_dict is not None:
            refs.extend(refs_from_grobid(fulltext_release, tei_dict))

    return refs
Exemple #2
0
def test_grobid_teixml2json() -> None:
    """
    Parse the example GROBID TEI-XML fixture with ``teixml2json`` and check
    the document title plus the fields of the citation with id "b12".
    """

    # explicit encoding so the test does not depend on the platform locale
    # (assumes the fixture is UTF-8 -- TODO confirm)
    with open("tests/files/example_grobid.tei.xml", "r",
              encoding="utf-8") as f:
        blob = f.read()

    # NOTE(review): meaning of the second positional argument is not visible
    # from this test; confirm against the teixml2json signature
    obj = teixml2json(blob, True)

    assert (
        obj["title"] ==
        "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    ref = [c for c in obj["citations"] if c["id"] == "b12"][0]
    assert ref["authors"][0] == {
        "given_name": "K",
        "name": "K Tasa",
        "surname": "Tasa"
    }
    assert ref["journal"] == "Quality Management in Health Care"
    assert ref["title"] == "Using patient feedback for quality improvement"
    assert ref["date"] == "1996"
    assert ref["pages"] == "206-225"
    assert ref["volume"] == "8"
    assert (
        ref["unstructured"] ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
def test_transform_refs_grobid() -> None:
    """
    Run ``refs_from_grobid`` over the example GROBID TEI-XML fixture (using a
    dummy release) and check the biblio fields of the reference at index 12.
    """

    # explicit encoding so the test does not depend on the platform locale
    # (assumes the fixture is UTF-8 -- TODO confirm)
    with open("tests/files/example_grobid.tei.xml", "r",
              encoding="utf-8") as f:
        blob = f.read()

    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        ext_ids={},
    )

    tei_dict = teixml2json(blob, True)
    refs = refs_from_grobid(dummy_release, tei_dict)

    ref = refs[12].biblio
    assert ref.contrib_raw_names is not None
    assert ref.contrib_raw_names[0] == "K Tasa"
    assert ref.container_name == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    # unlike the raw TEI dict, the transformed ref carries the year as an int
    assert ref.year == 1996
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
Exemple #4
0
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
    """
    Transform a "heavy" intermediate bundle into a ``ScholarDoc``.

    Two document types are handled:

    - ``DocType.sim_page``: key, biblio and fulltext all come from
      ``heavy.sim_fulltext``.
    - ``DocType.work``: biblio and abstracts come from the release identified
      by ``heavy.biblio_release_ident`` (abstracts fall back to the other
      releases, then to GROBID).

    Fulltext sources are tried in order: SIM (set up-front when present),
    GROBID, pdftotext, then HTML. The pdftotext and HTML sources are only
    consulted while no fulltext has been found. When ``check_exclude_web``
    flags the biblio, web-derived fulltext is stored via ``remove_access()``.

    Returns ``None`` for a sim_page bundle lacking a first page or issue
    item, because no valid document key can be built.

    Raises ``NotImplementedError`` for any other doc_type, and ``IndexError``
    if a release, file or webcapture referenced by ident is not present in
    the bundle.
    """

    work_ident: Optional[str] = None
    sim_issue: Optional[str] = None
    abstracts: List[ScholarAbstract] = []
    fulltext: Optional[ScholarFulltext] = None
    primary_release: Optional[ReleaseEntity] = None
    exclude_web_fulltext: bool = False

    ia_sim: Optional[ScholarSim] = None
    if heavy.sim_fulltext is not None:
        ia_sim = es_sim_from_sim(heavy.sim_fulltext)
        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)

    if heavy.doc_type == DocType.sim_page:
        assert ia_sim is not None
        assert heavy.sim_fulltext is not None
        if not ia_sim.first_page or not ia_sim.issue_item:
            # can't create a valid key if we don't have these fields, so shouldn't index
            return None
        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
        sim_issue = ia_sim.issue_item
        biblio = es_biblio_from_sim(heavy.sim_fulltext)
        # fulltext extracted from heavy.sim_fulltext above
    elif heavy.doc_type == DocType.work:
        work_ident = heavy.releases[0].work_id
        key = f"work_{work_ident}"
        assert heavy.biblio_release_ident
        primary_release = [
            r for r in heavy.releases if r.ident == heavy.biblio_release_ident
        ][0]
        biblio = es_biblio_from_release(primary_release)
        biblio = biblio_metadata_hacks(biblio)
        exclude_web_fulltext = check_exclude_web(biblio)
        abstracts = es_abstracts_from_release(primary_release)

        # if no abstract from primary_release, try all the other releases,
        # stopping at the first release that yields any
        for release in heavy.releases:
            if abstracts:
                break
            abstracts = es_abstracts_from_release(release)
    else:
        raise NotImplementedError(f"doc_type: {heavy.doc_type}")

    if heavy.grobid_fulltext:
        fulltext_release = [
            r for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f for f in fulltext_release.files
            if f.ident == heavy.grobid_fulltext["file_ident"]
        ][0]
        try:
            tei_dict: Optional[dict] = teixml2json(
                heavy.grobid_fulltext["tei_xml"])
        except xml.etree.ElementTree.ParseError:
            # unparseable TEI-XML: skip GROBID, fall through to other sources
            tei_dict = None
        if tei_dict:
            if not abstracts:
                abstracts = es_abstracts_from_grobid(tei_dict)
            grobid_fulltext = es_fulltext_from_grobid(tei_dict, heavy.pdf_meta,
                                                      fulltext_release,
                                                      fulltext_file)
            if exclude_web_fulltext and grobid_fulltext:
                if not fulltext:
                    # include only partial fulltext object, with no access
                    fulltext = grobid_fulltext.remove_access()
            else:
                # NOTE: replaces any fulltext set earlier (eg, from SIM)
                fulltext = grobid_fulltext

    if not fulltext and heavy.pdftotext_fulltext:
        fulltext_release = [
            r for r in heavy.releases
            if r.ident == heavy.pdftotext_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f for f in fulltext_release.files
            if f.ident == heavy.pdftotext_fulltext["file_ident"]
        ][0]
        pdftotext_fulltext = es_fulltext_from_pdftotext(
            heavy.pdftotext_fulltext["raw_text"],
            heavy.pdf_meta,
            fulltext_release,
            fulltext_file,
        )
        if exclude_web_fulltext and pdftotext_fulltext:
            fulltext = pdftotext_fulltext.remove_access()
        else:
            fulltext = pdftotext_fulltext

    if not fulltext and heavy.html_fulltext:
        fulltext_release = [
            r for r in heavy.releases
            if r.ident == heavy.html_fulltext["release_ident"]
        ][0]
        fulltext_webcapture = [
            f for f in fulltext_release.webcaptures
            if f.ident == heavy.html_fulltext["webcapture_ident"]
        ][0]
        html_fulltext = es_fulltext_from_html(
            heavy.html_fulltext,
            fulltext_release,
            fulltext_webcapture,
        )
        if exclude_web_fulltext and html_fulltext:
            fulltext = html_fulltext.remove_access()
        else:
            fulltext = html_fulltext

    # access options, keyed by access type so each type appears at most once
    # TODO: additional access list (eg, HTML if only PDF currently)
    access_dict = {}
    if fulltext and fulltext.access_type:
        access_dict[fulltext.access_type] = ScholarAccess(
            access_type=fulltext.access_type,
            access_url=fulltext.access_url,
            mimetype=fulltext.file_mimetype,
            file_ident=fulltext.file_ident,
            release_ident=fulltext.release_ident,
        )
    if ia_sim and AccessType.ia_sim not in access_dict:
        access_dict[AccessType.ia_sim] = ScholarAccess(
            access_type=AccessType.ia_sim,
            access_url=
            f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
            # TODO: release_ident
        )

    # TODO: additional abstracts (?)

    tags: List[str] = generate_tags(biblio, primary_release)

    # biorxiv/medrxiv hacks: both preprint servers share the 10.1101 DOI
    # prefix, so the access URL host is used to tell them apart
    if (biblio.doi_prefix == "10.1101"
            and biblio.container_name in (None, "biorxiv/medrxiv")
            and biblio.release_stage != "published"):
        for acc in access_dict.values():
            if "://www.medrxiv.org/" in acc.access_url:
                biblio.container_name = "medRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"
            elif "://www.biorxiv.org/" in acc.access_url:
                biblio.container_name = "bioRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"

    return ScholarDoc(
        key=key,
        collapse_key=sim_issue or work_ident,
        doc_type=heavy.doc_type.value,
        # NOTE(review): utcnow() yields a naive datetime and is deprecated as
        # of Python 3.12; kept as-is so the serialized timestamp format does
        # not change
        doc_index_ts=datetime.datetime.utcnow(),
        work_ident=work_ident,
        tags=tags,
        biblio=biblio,
        fulltext=fulltext,
        ia_sim=ia_sim,
        abstracts=abstracts,
        releases=[es_release_from_release(r) for r in heavy.releases],
        access=list(access_dict.values()),
    )
Exemple #5
0
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
    """
    Transform a "heavy" intermediate bundle into a ``ScholarDoc``.

    Two document types are handled:

    - ``DocType.sim_page``: key, biblio and fulltext all come from
      ``heavy.sim_fulltext``.
    - ``DocType.work``: biblio and abstracts come from the release identified
      by ``heavy.biblio_release_ident`` (abstracts fall back to the other
      releases, then to GROBID).

    Fulltext sources are tried in order: SIM (set up-front when present),
    GROBID (replaces any earlier fulltext when its TEI-XML parses), then
    pdftotext (only while no fulltext has been found). Unparseable GROBID
    TEI-XML is skipped rather than aborting the whole transform.

    Tags are de-duplicated and returned in sorted order.

    Returns ``None`` for a sim_page bundle lacking a first page or issue
    item, because no valid document key can be built.

    Raises ``NotImplementedError`` for any other doc_type, and ``IndexError``
    if a release or file referenced by ident is not present in the bundle.
    """
    # function-scope import: needed only for the ParseError type below
    import xml.etree.ElementTree

    work_ident: Optional[str] = None
    sim_issue: Optional[str] = None
    abstracts: List[ScholarAbstract] = []
    fulltext: Optional[ScholarFulltext] = None
    primary_release: Optional[ReleaseEntity] = None

    ia_sim: Optional[ScholarSim] = None
    if heavy.sim_fulltext is not None:
        ia_sim = es_sim_from_sim(heavy.sim_fulltext)
        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)

    if heavy.doc_type == DocType.sim_page:
        assert ia_sim is not None
        assert heavy.sim_fulltext is not None
        if not ia_sim.first_page or not ia_sim.issue_item:
            # can't create a valid key if we don't have these fields, so shouldn't index
            return None
        key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
        sim_issue = ia_sim.issue_item
        biblio = es_biblio_from_sim(heavy.sim_fulltext)
        # fulltext extracted from heavy.sim_fulltext above
    elif heavy.doc_type == DocType.work:
        work_ident = heavy.releases[0].work_id
        key = f"work_{work_ident}"
        assert heavy.biblio_release_ident
        primary_release = [
            r for r in heavy.releases if r.ident == heavy.biblio_release_ident
        ][0]
        biblio = es_biblio_from_release(primary_release)
        biblio = biblio_metadata_hacks(biblio)
        abstracts = es_abstracts_from_release(primary_release)

        # if no abstract from primary_release, try all the other releases,
        # stopping at the first release that yields any
        for release in heavy.releases:
            if abstracts:
                break
            abstracts = es_abstracts_from_release(release)
    else:
        raise NotImplementedError(f"doc_type: {heavy.doc_type}")

    if heavy.grobid_fulltext:
        fulltext_release = [
            r for r in heavy.releases
            if r.ident == heavy.grobid_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f for f in fulltext_release.files
            if f.ident == heavy.grobid_fulltext["file_ident"]
        ][0]
        try:
            tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
        except xml.etree.ElementTree.ParseError:
            # unparseable TEI-XML: skip GROBID, fall through to pdftotext
            tei_dict = None
        if tei_dict is not None:
            # NOTE: replaces any fulltext set earlier (eg, from SIM)
            fulltext = es_fulltext_from_grobid(tei_dict, heavy.pdf_meta,
                                               fulltext_release, fulltext_file)
            if not abstracts:
                abstracts = es_abstracts_from_grobid(tei_dict)

    if not fulltext and heavy.pdftotext_fulltext:
        fulltext_release = [
            r for r in heavy.releases
            if r.ident == heavy.pdftotext_fulltext["release_ident"]
        ][0]
        fulltext_file = [
            f for f in fulltext_release.files
            if f.ident == heavy.pdftotext_fulltext["file_ident"]
        ][0]
        fulltext = es_fulltext_from_pdftotext(
            heavy.pdftotext_fulltext["raw_text"],
            heavy.pdf_meta,
            fulltext_release,
            fulltext_file,
        )

    # access options, keyed by access type so each type appears at most once
    # TODO: additional access list
    access_dict = {}
    if fulltext and fulltext.access_type:
        access_dict[fulltext.access_type] = ScholarAccess(
            access_type=fulltext.access_type,
            access_url=fulltext.access_url,
            mimetype=fulltext.file_mimetype,
            file_ident=fulltext.file_ident,
            release_ident=fulltext.release_ident,
        )
    if ia_sim and AccessType.ia_sim not in access_dict:
        access_dict[AccessType.ia_sim] = ScholarAccess(
            access_type=AccessType.ia_sim,
            access_url=
            f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
        )

    # TODO: additional abstracts

    # tags: collected in a set so repeated "oa" signals collapse to one entry
    tag_set = set()
    if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"):
        tag_set.add("oa")
    container = primary_release.container if primary_release else None
    extra = container.extra if container else None
    if extra:
        # each of these container flags is its own tag and also implies "oa"
        for flag in ("doaj", "road", "szczepanski"):
            if extra.get(flag):
                tag_set.update((flag, "oa"))
        # `or {}` / `or ""` guard against keys present with an explicit None
        if (extra.get("ia") or {}).get("longtail_oa"):
            tag_set.update(("longtail", "oa"))
        if (extra.get("sherpa_romeo") or {}).get("color") == "white":
            tag_set.add("oa")
        if (extra.get("default_license") or "").lower().startswith("cc-"):
            tag_set.add("oa")
        if extra.get("platform"):
            # scielo, ojs, wordpress, etc
            tag_set.add(extra["platform"].lower())
    if biblio.doi_prefix == "10.2307":
        tag_set.add("jstor")
    # sorted so the emitted tag order is deterministic across runs
    tags: List[str] = sorted(tag_set)

    # biorxiv/medrxiv hacks: both preprint servers share the 10.1101 DOI
    # prefix, so the access URL host is used to tell them apart
    if (biblio.doi_prefix == "10.1101"
            and biblio.container_name in (None, "biorxiv/medrxiv")
            and biblio.release_stage != "published"):
        for acc in access_dict.values():
            if "://www.medrxiv.org/" in acc.access_url:
                biblio.container_name = "medRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"
            elif "://www.biorxiv.org/" in acc.access_url:
                biblio.container_name = "bioRxiv"
                if biblio.release_stage is None:
                    biblio.release_stage = "submitted"

    return ScholarDoc(
        key=key,
        collapse_key=sim_issue or work_ident,
        doc_type=heavy.doc_type.value,
        # NOTE(review): utcnow() yields a naive datetime and is deprecated as
        # of Python 3.12; kept as-is so the serialized timestamp format does
        # not change
        doc_index_ts=datetime.datetime.utcnow(),
        work_ident=work_ident,
        tags=tags,
        biblio=biblio,
        fulltext=fulltext,
        ia_sim=ia_sim,
        abstracts=abstracts,
        releases=[es_release_from_release(r) for r in heavy.releases],
        access=list(access_dict.values()),
    )