Example #1
def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
    """
    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
    ReleaseEntity object (for use with fuzzycat)
    """
    contribs = []
    for author in ref.get("authors") or []:
        contribs.append(
            ReleaseContrib(
                raw_name=author.get("name"),
                given_name=author.get("given_name"),
                surname=author.get("surname"),
            ))
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=clean_doi(ref.get("doi")),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    if ref.get("journal"):
        release.extra = {"container_name": ref.get("journal")}
    if ref.get("date"):
        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
            release.release_year = int(ref["date"][0:4])
        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
    return release
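# Usage sketch (not part of the original source): a hypothetical ref dict,
# shaped like transform_grobid_ref_xml() output, run through the helper above.
# Assumes clean_doi() tolerates a None argument.
_ref = {
    "title": "Using patient feedback for quality improvement",
    "authors": [{"name": "K Tasa", "given_name": "K", "surname": "Tasa"}],
    "journal": "Quality Management in Health Care",
    "date": "1996-01-01",
    "volume": "8",
    "pages": "206-225",
}
_release = grobid_ref_to_release(_ref)
assert _release.release_year == 1996
assert _release.extra == {"container_name": "Quality Management in Health Care"}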
def ref_to_release(ref: dict) -> ReleaseEntity:
    contribs = []
    for author in ref.get("authors") or []:
        contribs.append(
            ReleaseContrib(
                raw_name=author.get("name"),
                given_name=author.get("given_name"),
                surname=author.get("surname"),
            ))
    release = ReleaseEntity(
        title=ref.get("title"),
        contribs=contribs,
        volume=ref.get("volume"),
        issue=ref.get("issue"),
        pages=ref.get("pages"),
        ext_ids=ReleaseExtIds(
            doi=ref.get("doi"),
            pmid=ref.get("pmid"),
            pmcid=ref.get("pmcid"),
            arxiv=ref.get("arxiv_id"),
        ),
    )
    if ref.get("journal"):
        release.extra = {"container_name": ref.get("journal")}
    if ref.get("date"):
        if len(ref["date"]) == 4 and ref["date"].isdigit():
            release.release_year = int(ref["date"])
    return release
def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
    contribs = []
    for author in ref.authors or []:
        contribs.append(
            ReleaseContrib(
                raw_name=author.full_name,
                given_name=author.given_name,
                surname=author.surname,
            )
        )
    release = ReleaseEntity(
        title=ref.title,
        contribs=contribs,
        volume=ref.volume,
        issue=ref.issue,
        pages=ref.pages,
        ext_ids=ReleaseExtIds(
            doi=ref.doi,
            pmid=ref.pmid,
            pmcid=ref.pmcid,
            arxiv=ref.arxiv_id,
        ),
    )
    if ref.journal:
        release.extra = {"container_name": ref.journal}
    if ref.date:
        if len(ref.date) == 4 and ref.date.isdigit():
            release.release_year = int(ref.date)
    return release
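# A minimal sketch of the GrobidBiblio-based variant above, assuming the
# grobid_tei_xml dataclasses are importable from the package top level and
# accept these keyword fields (hypothetical values):
from grobid_tei_xml import GrobidAuthor, GrobidBiblio

_ref = GrobidBiblio(
    authors=[GrobidAuthor(full_name="K Tasa", given_name="K", surname="Tasa")],
    title="Using patient feedback for quality improvement",
    journal="Quality Management in Health Care",
    date="1996",
    volume="8",
)
_rel = ref_to_release(_ref)
assert _rel.release_year == 1996
assert _rel.extra == {"container_name": "Quality Management in Health Care"}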
Example #4
def test_elasticsearch_release_kbart_year():
    this_year = datetime.date.today().year
    r = ReleaseEntity(
        title="something",
        release_year=this_year,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "kbart": {
                "lockss": {
                    "year_spans": [[1900, this_year - 2]],
                },
            },
        },
    )
    es = release_to_elasticsearch(r)
    assert es["release_year"] == this_year

    assert es["preservation"] == "none"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is False
    assert es["in_web"] is False
    assert es["in_dweb"] is False
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is False
    assert es["in_jstor"] is False

    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "kbart": {
                "lockss": {
                    "year_spans": [[1900, this_year - 1]],
                },
            },
        },
    )
    es = release_to_elasticsearch(r)
    assert es["release_year"] == this_year

    assert es["preservation"] == "dark"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is False
    assert es["in_dweb"] is False
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is False
def test_transform_refs_grobid() -> None:

    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        ext_ids={},
    )

    tei_dict = teixml2json(blob, True)
    refs = refs_from_grobid(dummy_release, tei_dict)

    ref = refs[12].biblio
    assert ref.contrib_raw_names is not None
    assert ref.contrib_raw_names[0] == "K Tasa"
    assert ref.container_name == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.year == 1996
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
Example #6
def enrich_release_from_crossref(release: ReleaseEntity,
                                 record: Dict[str, Any]) -> ReleaseEntity:
    """
    Hack to copy some SIM-relevant fields from Crossref record to release entity.

    We should really update fatcat catalog itself with these fields, instead of
    doing the update here in the scholar pipeline, but that is a more delicate
    update, and we expect this to help make SIM matches faster (late 2021/early
    2022).
    """
    if release.volume is None and record.get("volume"):
        release.volume = clean_str(record["volume"])
    if release.issue is None and record.get("issue"):
        release.issue = clean_str(record["issue"])
    if release.pages is None and record.get("page"):
        release.pages = clean_str(record["page"])
    return release
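# Illustration (hypothetical record): fields are only filled when missing on
# the release side. Assumes the clean_str() helper used above is in scope.
_release = ReleaseEntity(title="x", volume="12", ext_ids=ReleaseExtIds())
_record = {"volume": "99", "issue": "3", "page": "1-10"}
_release = enrich_release_from_crossref(_release, _record)
assert _release.volume == "12"  # existing value preserved
assert _release.issue == "3"    # gap filled from the Crossref record
assert _release.pages == "1-10"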
Example #7
    def update_entity(self, re: ReleaseEntity) -> None:
        """
        Mutates a release entity in place, updating fields with values from
        this form.

        Form must be validated *before* calling this function.
        """
        for simple_attr in RELEASE_SIMPLE_ATTRS:
            a = getattr(self, simple_attr).data
            # special case blank strings
            if a == "":
                a = None
            setattr(re, simple_attr, a)
        for extid_attr in RELEASE_EXTID_ATTRS:
            a = getattr(self, extid_attr).data
            # special case blank strings
            if a == "":
                a = None
            setattr(re.ext_ids, extid_attr, a)
        if self.release_date.data:
            re.release_year = self.release_date.data.year
        # bunch of complexity here to preserve old contrib metadata (eg,
        # affiliation and extra) not included in current forms
        # TODO: this may be broken; either way needs tests
        if re.contribs:
            old_contribs = re.contribs.copy()
        else:
            old_contribs = []
        re.contribs = []
        for c in self.contribs:
            if c.prev_index.data not in ("", None):
                rc = old_contribs[int(c.prev_index.data)]
                rc.role = c.role.data or None
                rc.raw_name = c.raw_name.data or None
            else:
                rc = ReleaseContrib(
                    role=c.role.data or None,
                    raw_name=c.raw_name.data or None,
                )
            re.contribs.append(rc)
        if self.edit_description.data:
            re.edit_extra = dict(description=self.edit_description.data)
Example #8
    def parse_record(self, row: str) -> ReleaseEntity:

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        ident = row.strip().split()[0]
        assert len(ident) == 26

        return ReleaseEntity(
            ident=ident,
            ext_ids=ReleaseExtIds(),
        )
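# Usage sketch (hypothetical importer instance, with bezerk_mode disabled):
# rows are lines whose first whitespace-separated token is a 26-character ident.
_row = "iznnn644szdwva7khyxqzc73bi\n"
_release = importer.parse_record(_row)
assert _release.ident == "iznnn644szdwva7khyxqzc73bi"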
def test_transform_refs_crossref() -> None:

    with open("tests/files/example_crossref_record.json", "r") as f:
        record = json.loads(f.read())

    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        release_stage="accepted",
        ext_ids={},
    )

    refs = refs_from_crossref(dummy_release, record)

    assert refs[0].release_ident == "releasedummy22222222222222"
    assert refs[0].work_ident == "workdummy22222222222222222"
    assert refs[0].release_stage == "accepted"
    assert refs[0].release_year == 1234
    assert refs[0].ref_source == "crossref"
    assert refs[0].key == "BIB0001|his12200-cit-0001"
    assert refs[0].index == 1
    assert refs[0].locator is None
    assert refs[0].biblio.contrib_raw_names is not None
    assert refs[0].biblio.contrib_raw_names[0] == "Churg"
    assert refs[0].biblio.container_name == "Arch. Pathol. Lab. Med."
    assert (
        refs[0].biblio.title
        == "The separation of benign and malignant mesothelial proliferations"
    )
    assert refs[0].biblio.year == 2012
    assert refs[0].biblio.pages == "1217"
    assert refs[0].biblio.volume == "136"
    assert refs[0].biblio.doi == "10.5858/arpa.2012-0112-ra"
    assert refs[0].biblio.unstructured is None

    assert (
        refs[6].biblio.title
        == "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference"
    )
    assert refs[6].biblio.year == 2001

    assert refs[7].key == "CIT0041"
    assert (
        refs[7].biblio.unstructured
        == "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6."
    )

    assert refs[8].key == "576_CR3"
    assert refs[8].biblio.unstructured is not None
    assert refs[8].biblio.title == "The NURBS Book, Monographs in Visual Communication"
    assert refs[8].biblio.year == 1997
    assert refs[8].biblio.version == "2"
Example #10
def test_fuzzy_match_different(entity_importer, mocker) -> None:
    """
    Simple fuzzycat-mocked test for "strong match" case
    """

    r1 = ReleaseEntity(
        title="example title: novel work",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(doi="10.1234/abcdefg"),
    )
    r2 = ReleaseEntity(
        title="Example Title: Novel Work?",
        contribs=[ReleaseContrib(raw_name="robin hood")],
        ext_ids=ReleaseExtIds(),
    )
    r3 = ReleaseEntity(
        title="entirely different",
        contribs=[ReleaseContrib(raw_name="king tut")],
        ext_ids=ReleaseExtIds(),
    )

    match_raw = mocker.patch(
        'fatcat_tools.importers.common.match_release_fuzzy')
    match_raw.side_effect = [[r3, r2, r3, r2]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert (resp[0], resp[2]) == ("STRONG", r2)

    match_raw.side_effect = [[r2, r2, r3, r1]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert (resp[0], resp[2]) == ("EXACT", r1)

    match_raw.side_effect = [[r3]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert resp is None

    match_raw.side_effect = [[]]
    resp = entity_importer.match_existing_release_fuzzy(r1)
    assert resp is None
Example #11
def biblio_to_release(biblio: dict) -> ReleaseEntity:
    """
    Helper for close_fuzzy_biblio_matches() et al
    """
    contribs = []
    if biblio.get('authors'):
        for a in biblio['authors']:
            contribs.append(
                ReleaseContrib(
                    raw_name=a.get('name'),
                    given_name=a.get('given_name'),
                    surname=a.get('surname'),
                ))
    elif biblio.get('author_names'):
        for a in biblio['author_names']:
            contribs.append(ReleaseContrib(raw_name=a))
    elif biblio.get('first_author'):
        contribs.append(ReleaseContrib(raw_name=biblio['first_author']))
    release = ReleaseEntity(
        title=biblio.get("title"),
        ext_ids=ReleaseExtIds(
            doi=clean_doi(biblio.get("doi")),
            pmid=biblio.get("pmid"),
            pmcid=biblio.get("pmcid"),
            arxiv=biblio.get("arxiv_id"),
        ),
        volume=biblio.get("volume"),
        issue=biblio.get("issue"),
        pages=biblio.get("pages") or biblio.get("first_page"),
        publisher=biblio.get("publisher"),
        release_stage=biblio.get("release_stage"),
        release_type=biblio.get("release_type"),
        extra=dict(),
    )
    if biblio.get('journal'):
        release.extra['container_name'] = biblio['journal']
    elif biblio.get('conference'):
        release.extra['container_name'] = biblio['conference']
    if biblio.get('year'):
        year = biblio['year']
        if isinstance(year, str) and len(year) >= 4 and year[0:4].isdigit():
            release.release_year = int(year[0:4])
        elif isinstance(year, int):
            release.release_year = year
    elif biblio.get('date'):
        date = biblio['date']
        if isinstance(date, str) and len(date) >= 4 and date[0:4].isdigit():
            release.release_year = int(date[0:4])
    return release
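# Sketch of the author fallback order above ('authors', then 'author_names',
# then 'first_author') with hypothetical inputs. Assumes clean_doi() tolerates
# a None argument.
_rel = biblio_to_release({"title": "t", "author_names": ["Jane Roe", "John Doe"]})
assert [c.raw_name for c in _rel.contribs] == ["Jane Roe", "John Doe"]
_rel = biblio_to_release({"title": "t", "first_author": "Jane Roe", "year": 1996})
assert _rel.contribs[0].raw_name == "Jane Roe"
assert _rel.release_year == 1996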
Example #12
def test_fuzzy_match_none(entity_importer, mocker) -> None:
    """
    Simple ES-mocked test for "no search results" case
    """

    es_raw = mocker.patch(
        'elasticsearch.connection.Urllib3HttpConnection.perform_request')
    es_raw.side_effect = [
        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)),
        (200, {}, json.dumps(ES_RELEASE_EMPTY_RESP)),
    ]

    release = ReleaseEntity(
        title=
        "some long title which should not match anything because it is for testing",
        ext_ids=ReleaseExtIds(),
    )

    resp = entity_importer.match_existing_release_fuzzy(release)
    assert resp is None
Example #13
def generic_deleted_entity(entity_type: str, ident: str) -> Any:
    if entity_type == "container":
        entity: Any = ContainerEntity()
    elif entity_type == "creator":
        entity = CreatorEntity()
    elif entity_type == "file":
        entity = FileEntity()
    elif entity_type == "fileset":
        entity = FilesetEntity()
    elif entity_type == "webcapture":
        entity = WebcaptureEntity()
    elif entity_type == "release":
        entity = ReleaseEntity(ext_ids=ReleaseExtIds())
    elif entity_type == "work":
        entity = WorkEntity()
    else:
        raise NotImplementedError
    entity.ident = ident
    entity.state = "deleted"
    return entity
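# Usage sketch (hypothetical ident):
_entity = generic_deleted_entity("release", "iznnn644szdwva7khyxqzc73bi")
assert _entity.state == "deleted"
assert _entity.ident == "iznnn644szdwva7khyxqzc73bi"
assert _entity.ext_ids is not None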
def test_transform_refs_grobid() -> None:

    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    dummy_release = ReleaseEntity(
        ident="releasedummy22222222222222",
        work_id="workdummy22222222222222222",
        release_year=1234,
        release_stage="accepted",
        ext_ids={},
    )

    tei_doc = parse_document_xml(blob)
    refs = refs_from_grobid(dummy_release, tei_doc)

    ref = refs[12]
    assert ref.release_ident == "releasedummy22222222222222"
    assert ref.work_ident == "workdummy22222222222222222"
    assert ref.release_stage == "accepted"
    assert ref.release_year == 1234
    assert ref.ref_source == "grobid"
    assert ref.key == "b12"
    assert ref.index == 13
    assert ref.locator is None
    assert ref.biblio.contrib_raw_names is not None
    assert ref.biblio.contrib_raw_names[0] == "K Tasa"
    assert ref.biblio.container_name == "Quality Management in Health Care"
    assert ref.biblio.title == "Using patient feedback for quality improvement"
    assert ref.biblio.year == 1996
    assert ref.biblio.pages == "206-225"
    assert ref.biblio.volume == "8"
    assert (
        ref.biblio.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )
Example #15
    def to_entity(self) -> ReleaseEntity:
        assert self.title.data
        entity = ReleaseEntity(title=self.title.data, ext_ids=ReleaseExtIds())
        self.update_entity(entity)
        return entity
Example #16
def test_rich_elasticsearch_convert():
    r = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {
                "color": "blue"
            },
            "doaj": {
                "as_of": "2010-02-03"
            },
        },
    )
    r.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=[
                FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
                FileUrl(
                    rel="webarchive",
                    url=
                    "https://web.archive.org/web/20001122030405/http://example.com",
                ),
                FileUrl(rel="web",
                        url="https://archive.org/details/blah/file.pdf"),
            ],
            extra={
                "shadows": {},
            },
        )
    ]
    es = release_to_elasticsearch(r)
    assert es["release_year"] == r.release_year
    assert es["file_count"] == 1
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 0
    assert es["ref_count"] == 2
    assert es["ref_linked_count"] == 1

    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is True
Example #17
    def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity:
        """
        This function handles known special cases. For example,
        publisher-specific or platform-specific workarounds.
        """

        # only runs on datacite entities with a DOI
        assert re.ext_ids.doi

        # release_type exception: Global Biodiversity Information Facility
        # publishes highly interesting datasets, but titles are mostly the same
        # ("GBIF Occurrence Download" or "Occurrence Download"); set
        # release_type to "stub" (CSL/FC).
        if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith(
                "10.15468/dl."):
            re.release_type = "stub"

        # release_type exception: lots of "Experimental Crystal Structure Determination"
        # publisher: "Cambridge Crystallographic Data Centre"
        if re.ext_ids.doi.startswith("10.5517/"):
            re.release_type = "entry"

        # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
        if re.title.lower().startswith(
                "additional file") and re.release_type in (
                    "article",
                    "article-journal",
                ):
            re.release_type = "component"

        # figshare
        if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith(
                "10.25384"):
            # set version if DOI ends with versioned suffix
            doi_suffix = re.ext_ids.doi.split(".")[-1]
            if doi_suffix and doi_suffix.startswith(
                    "v") and doi_suffix[1:].isdigit():
                re.version = doi_suffix
            # "Figure 123 from " -> component
            # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
            if " from " in re.title and re.release_type not in ("stub",
                                                                "graphic"):
                if re.title.startswith("Figure "):
                    re.release_type = "component"
                elif re.title.startswith("Table "):
                    re.release_type = "component"

        # figshare.com
        if (re.ext_ids.doi.startswith("10.6084/m9.figshare.")
                and re.extra.get("container_name") is None):
            re.extra["container_name"] = "figshare.com"

        # Columbia Institutional Repository includes full bibliographic
        # metadata, which results in incorrect container_id matches. But this
        # DOI prefix also publishes actual journals!
        if (re.ext_ids.doi.startswith("10.7916/") and "-" in re.ext_ids.doi
                and re.publisher == "Columbia University" and re.extra
                and re.extra.get("datacite")):
            for relation in re.extra["datacite"].get("relations", []):
                if relation.get("relationType") == "IsVariantFormOf":
                    re.container_id = None
                    if re.release_stage in ("published", None):
                        re.release_stage = "submitted"

        # several institutional and other repositories (including "RWTH" and
        # "DESY") also result in incorrect container_id matches.
        # This probably doesn't filter out enough, but is a start.
        IR_DOI_PREFIXES = [
            "10.15495/epub_ubt_",
            "10.18154/rwth-20",
            "10.3204/pubdb-",
            "10.3204/phppubdb-",
            "10.26204/kluedo/",
        ]
        if re.extra and re.extra.get("datacite"):
            for prefix in IR_DOI_PREFIXES:
                if re.ext_ids.doi.startswith(prefix):
                    for relation in re.extra["datacite"].get("relations", []):
                        if relation.get("relationType") == "IsVariantFormOf":
                            re.container_id = None
        return re
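# Sketch of the GBIF special case above, calling the helper directly on a
# hypothetical entity (assumes it is usable as a plain/static helper):
_re = ReleaseEntity(
    title="GBIF Occurrence Download",
    release_type="dataset",
    ext_ids=ReleaseExtIds(doi="10.15468/dl.abcdef"),
)
_re = biblio_hacks(_re)
assert _re.release_type == "stub"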
Example #18
    def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]:

        if not record:
            return None
        metadata = record.arXivRaw
        if not metadata:
            return None
        extra: Dict[str, Any] = dict()
        extra_arxiv: Dict[str, Any] = dict()

        # don't know!
        release_type = "article"

        base_id = metadata.id.string
        doi = None
        if metadata.doi and metadata.doi.string:
            doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
            if doi and not (doi.startswith("10.") and "/" in doi
                            and doi.split("/")[1]):
                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                doi = None
        title = latex_to_text(metadata.title.get_text().replace("\n", " "))
        authors = parse_arxiv_authors(metadata.authors.get_text().replace(
            "\n", " "))
        contribs = [
            fatcat_openapi_client.ReleaseContrib(index=i,
                                                 raw_name=a,
                                                 role="author")
            for i, a in enumerate(authors)
        ]

        lang: Optional[str] = "en"  # the vast majority in english
        if metadata.comments and metadata.comments.get_text():
            comments = metadata.comments.get_text().replace("\n", " ").strip()
            extra_arxiv["comments"] = comments
            if "in french" in comments.lower():
                lang = "fr"
            elif "in spanish" in comments.lower():
                lang = "es"
            elif "in portuguese" in comments.lower():
                lang = "pt"
            elif "in hindi" in comments.lower():
                lang = "hi"
            elif "in japanese" in comments.lower():
                lang = "ja"
            elif "in german" in comments.lower():
                lang = "de"
            elif "simplified chinese" in comments.lower():
                lang = "zh"
            elif "in russian" in comments.lower():
                lang = "ru"
            # more languages?

        number = None
        if metadata.find("journal-ref") and metadata.find(
                "journal-ref").get_text():
            journal_ref = metadata.find("journal-ref").get_text().replace(
                "\n", " ").strip()
            extra_arxiv["journal_ref"] = journal_ref
            if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(
            ):
                release_type = "paper-conference"
        if metadata.find("report-no") and metadata.find("report-no").string:
            number = metadata.find("report-no").string.strip()
            # at least some people plop extra metadata in here. hrmf!
            if "ISSN " in number or "ISBN " in number or len(
                    number.split()) > 2:
                extra_arxiv["report-no"] = number
                number = None
            else:
                release_type = "report"
        if metadata.find("acm-class") and metadata.find("acm-class").string:
            extra_arxiv["acm_class"] = metadata.find(
                "acm-class").string.strip()
        if metadata.categories and metadata.categories.get_text():
            extra_arxiv["categories"] = metadata.categories.get_text().split()
        license_slug = None
        if metadata.license and metadata.license.get_text():
            license_slug = lookup_license_slug(metadata.license.get_text())
        abstracts = None
        if metadata.abstract:
            # TODO: test for this multi-abstract code path
            abstracts = []
            abst = metadata.abstract.get_text().strip()
            orig = None
            if "-----" in abst:
                both = abst.split("-----")
                abst = both[0].strip()
                orig = both[1].strip()
            if "$" in abst or "{" in abst:
                mime = "application/x-latex"
                abst_plain = latex_to_text(abst)
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(
                        content=abst_plain, mimetype="text/plain", lang="en"))
            else:
                mime = "text/plain"
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(content=abst,
                                                      mimetype=mime,
                                                      lang="en"))
            if orig:
                abstracts.append(
                    fatcat_openapi_client.ReleaseAbstract(content=orig,
                                                          mimetype=mime))
                # indicates that fulltext probably isn't english either
                if lang == "en":
                    lang = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   arxiv: comments, categories, etc
        extra_arxiv["base_id"] = base_id
        extra["superceded"] = True
        extra["arxiv"] = extra_arxiv

        versions = []
        for version in metadata.find_all("version"):
            arxiv_id = base_id + version["version"]
            release_date = version.date.string.strip()
            release_date = datetime.datetime.strptime(
                release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
            # TODO: source_type?
            versions.append(
                ReleaseEntity(
                    work_id=None,
                    title=title,
                    # original_title
                    version=version["version"],
                    release_type=release_type,
                    release_stage="submitted",
                    release_date=release_date.isoformat(),
                    release_year=release_date.year,
                    ext_ids=fatcat_openapi_client.ReleaseExtIds(
                        arxiv=arxiv_id, ),
                    number=number,
                    language=lang,
                    license_slug=license_slug,
                    abstracts=abstracts,
                    contribs=contribs,
                    extra=extra.copy(),
                ))
        # TODO: assert that versions are actually in order?
        assert versions

        versions[-1].extra.pop("superceded")

        # only apply DOI to most recent version (HACK)
        if doi:
            versions[-1].ext_ids.doi = doi
            if len(versions) > 1:
                versions[-1].release_stage = "accepted"
        return versions
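# For reference, a version date in the arXivRaw format parsed above
# (hypothetical value):
import datetime

_d = datetime.datetime.strptime(
    "Mon, 2 Apr 2007 19:18:42 GMT", "%a, %d %b %Y %H:%M:%S %Z").date()
assert _d.isoformat() == "2007-04-02"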
Example #19
def enrich_release_entity(entity: ReleaseEntity) -> ReleaseEntity:
    if entity.state in ("redirect", "deleted"):
        return entity
    if entity.state == "active":
        entity._es = release_to_elasticsearch(entity, force_bool=False)
    if entity.container and entity.container.state == "active":
        entity.container._es = container_to_elasticsearch(entity.container,
                                                          force_bool=False)
    if entity.files:
        # remove shadows-only files with no URLs
        entity.files = [
            f for f in entity.files
            if not (f.extra and f.extra.get("shadows") and not f.urls)
        ]
    if entity.filesets:
        for fs in entity.filesets:
            fs._total_size = sum([f.size for f in fs.manifest])
    if entity.webcaptures:
        for wc in entity.webcaptures:
            wc._wayback_suffix = wayback_suffix(wc)
    for ref in entity.refs:
        # this is a UI hack to get rid of XML crud in unstructured refs like:
        # LOCKSS (2014) Available: <ext-link
        # xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri"
        # xlink:href="http://lockss.org/"
        # xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014
        # November 1.
        if ref.extra and ref.extra.get("unstructured"):
            ref.extra["unstructured"] = strip_extlink_xml(
                ref.extra["unstructured"])
    # for backwards compatibility, copy extra['subtitle'] to subtitle
    if not entity.subtitle and entity.extra and entity.extra.get("subtitle"):
        if isinstance(entity.extra["subtitle"], str):
            entity.subtitle = entity.extra["subtitle"]
        elif isinstance(entity.extra["subtitle"], list):
            entity.subtitle = entity.extra["subtitle"][0] or None
    # author list to display; ensure it's sorted by index (any authors with
    # index=None go to the end of the list)
    authors = [
        c for c in entity.contribs if c.role in ("author", None) and (
            c.surname or c.raw_name or (c.creator and c.creator.surname))
    ]
    entity._authors = sorted(authors,
                             key=lambda c:
                             (c.index is None and 99999999) or c.index)
    # need authors, title for citeproc to work
    entity._can_citeproc = bool(entity._authors) and bool(entity.title)
    if entity.abstracts and entity.abstracts[0].mimetype:
        # hack to show plain text instead of latex abstracts
        if "latex" in entity.abstracts[0].mimetype:
            entity.abstracts.reverse()
        # hack to (partially) clean up common JATS abstract display case
        if entity.abstracts[0].mimetype == "application/xml+jats":
            for tag in ("p", "jats", "jats:p"):
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("<{}>".format(tag), "")
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("</{}>".format(tag), "")
                # ugh, double encoding happens
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("&lt;/{}&gt;".format(tag), "")
                entity.abstracts[0].content = entity.abstracts[
                    0].content.replace("&lt;{}&gt;".format(tag), "")
    return entity
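# Sketch of the author-ordering key used above: contribs with index=None sort
# after any indexed contrib (hypothetical contribs).
_contribs = [
    ReleaseContrib(raw_name="c", index=None),
    ReleaseContrib(raw_name="a", index=0),
    ReleaseContrib(raw_name="b", index=1),
]
_ordered = sorted(_contribs, key=lambda c: (c.index is None and 99999999) or c.index)
assert [c.raw_name for c in _ordered] == ["a", "b", "c"]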
Example #20
    def try_update(self, re: ReleaseEntity) -> bool:

        # first, lookup existing by PMID (which must be defined)
        existing = None
        try:
            existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        # then try DOI lookup if there is one
        if not existing and re.ext_ids.doi:
            try:
                existing = self.api.lookup_release(doi=re.ext_ids.doi)
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
            if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
                warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid
                )
                warnings.warn(warn_str)
                self.counts["warn-pmid-doi-mismatch"] += 1
                # don't clobber DOI, but do group together
                re.ext_ids.doi = None
                re.work_id = existing.work_id

        if existing and not self.do_updates:
            self.counts["exists"] += 1
            return False

        if (
            existing
            and existing.ext_ids.pmid
            and (existing.ext_ids.pmcid or not re.ext_ids.pmcid)
            and (existing.refs or not re.refs)
        ):
            # TODO: any other reasons to do an update?
            # don't update if it already has PMID
            self.counts["exists"] += 1
            return False
        elif existing:
            # but do update if only DOI was set
            existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
            existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
            existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid

            existing.container_id = existing.container_id or re.container_id
            existing.refs = existing.refs or re.refs
            existing.abstracts = existing.abstracts or re.abstracts
            existing.extra["pubmed"] = re.extra["pubmed"]

            # fix stub titles
            if existing.title in [
                "OUP accepted manuscript",
            ]:
                existing.title = re.title

            existing.original_title = existing.original_title or re.original_title
            existing.release_type = existing.release_type or re.release_type
            existing.release_stage = existing.release_stage or re.release_stage
            existing.release_date = existing.release_date or re.release_date
            existing.release_year = existing.release_year or re.release_year
            existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
            existing.volume = existing.volume or re.volume
            existing.issue = existing.issue or re.issue
            existing.pages = existing.pages or re.pages
            existing.language = existing.language or re.language

            # update subtitle in-place first
            if not existing.subtitle and existing.extra.get("subtitle"):
                subtitle = existing.extra.pop("subtitle")
                if isinstance(subtitle, list):
                    subtitle = subtitle[0]
                if subtitle:
                    existing.subtitle = subtitle
            if not existing.subtitle:
                existing.subtitle = re.subtitle

            try:
                self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
                self.counts["update"] += 1
            except fatcat_openapi_client.rest.ApiException as err:
                # there is a code path where we try to update the same release
                # twice in a row; if that happens, just skip
                # NOTE: API behavior might change in the future?
                if "release_edit_editgroup_id_ident_id_key" in err.body:
                    self.counts["skip-update-conflict"] += 1
                    return False
                else:
                    raise err
            finally:
                return False

        return True
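# Sketch of the field-merge rule used above: existing catalog values win, and
# incoming values only fill gaps (hypothetical entities).
existing = ReleaseEntity(title="x", pages="12-34", ext_ids=ReleaseExtIds())
incoming = ReleaseEntity(title="x", volume="7", pages="99", ext_ids=ReleaseExtIds())
existing.volume = existing.volume or incoming.volume
existing.pages = existing.pages or incoming.pages
assert existing.volume == "7"     # gap filled from incoming record
assert existing.pages == "12-34"  # existing catalog value preserved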
Example #21
    def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
        """
        obj is a python dict (parsed from json).
        returns a ReleaseEntity
        """

        # Ways to be out of scope (provisionally)
        # journal-issue and journal-volume map to None, but allowed for now
        if obj.get("type") in (
                None,
                "journal",
                "proceedings",
                "standard-series",
                "report-series",
                "book-series",
                "book-set",
                "book-track",
                "proceedings-series",
        ):
            self.counts["skip-release-type"] += 1
            return None

        # Do require the 'title' keys to exist, as release entities do
        if ("title" not in obj) or (not obj["title"]):
            self.counts["skip-blank-title"] += 1
            return None

        release_type = self.map_release_type(obj["type"])

        # contribs
        def do_contribs(obj_list: List[Dict[str, Any]],
                        ctype: str) -> List[ReleaseContrib]:
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if "ORCID" in am.keys():
                    creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
                # Sorry humans :(
                if am.get("given") and am.get("family"):
                    raw_name: Optional[str] = "{} {}".format(
                        am["given"], am["family"])
                elif am.get("family"):
                    raw_name = am["family"]
                else:
                    # TODO: can end up empty
                    raw_name = am.get("name") or am.get("given")
                extra: Dict[str, Any] = dict()
                if ctype == "author":
                    index: Optional[int] = i
                else:
                    index = None
                raw_affiliation = None
                affiliation_list = am.get("affiliation") or []
                # TODO: currently requiring a "name" in all affiliations. Could
                # add ROR support (via identifier) in the near future
                affiliation_list = [a for a in affiliation_list if "name" in a]
                if affiliation_list:
                    raw_affiliation = affiliation_list[0]["name"]
                    if len(affiliation_list) > 1:
                        # note: affiliation => more_affiliations
                        extra["more_affiliations"] = [
                            clean_str(a["name"]) for a in affiliation_list[1:]
                        ]
                if am.get("sequence") and am.get("sequence") != "additional":
                    extra["seq"] = clean_str(am.get("sequence"))
                assert ctype in ("author", "editor", "translator")
                raw_name = clean_str(raw_name)
                # TODO: what if 'raw_name' is None?
                contribs.append(
                    ReleaseContrib(
                        creator_id=creator_id,
                        index=index,
                        raw_name=raw_name,
                        given_name=clean_str(am.get("given")),
                        surname=clean_str(am.get("family")),
                        raw_affiliation=clean_str(raw_affiliation),
                        role=ctype,
                        extra=extra or None,
                    ))
            return contribs

        contribs = do_contribs(obj.get("author", []), "author")
        contribs.extend(do_contribs(obj.get("editor", []), "editor"))
        contribs.extend(do_contribs(obj.get("translator", []), "translator"))

        # container
        issn = obj.get("ISSN", [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = clean_str(obj.get("publisher"))

        container_name = obj.get("container-title")
        if container_name:
            container_name = clean_str(container_name[0], force_xml=True)
        if not container_name:
            container_name = None
        if (container_id is None and self.create_containers
                and (issnl is not None) and container_name):
            ce = fatcat_openapi_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                container_type=self.map_container_type(release_type),
                name=container_name,
            )
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        # license slug
        license_slug = None
        license_extra = []
        for lic in obj.get("license", []):
            if lic["content-version"] not in ("vor", "unspecified"):
                continue
            slug = lookup_license_slug(lic["URL"])
            if slug:
                license_slug = slug
            if "start" in lic:
                lic["start"] = lic["start"]["date-time"]
            license_extra.append(lic)

        # references
        refs = []
        for i, rm in enumerate(obj.get("reference", [])):
            try:
                year: Optional[int] = int(rm.get("year"))
                # TODO: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                year = None
            ref_extra: Dict[str, Any] = dict()
            key = rm.get("key")
            if key and key.startswith(obj["DOI"].upper()):
                key = key.replace(obj["DOI"].upper() + "-", "")
                key = key.replace(obj["DOI"].upper(), "")
            ref_container_name = rm.get("volume-title")
            if not ref_container_name:
                ref_container_name = rm.get("journal-title")
            elif rm.get("journal-title"):
                ref_extra["journal-title"] = rm["journal-title"]
            if rm.get("DOI"):
                ref_extra["doi"] = rm.get("DOI").lower()
            author = clean_str(rm.get("author"))
            if author:
                ref_extra["authors"] = [author]
            for k in (
                    "editor",
                    "edition",
                    "authority",
                    "version",
                    "genre",
                    "url",
                    "event",
                    "issue",
                    "volume",
                    "date",
                    "accessed_date",
                    "issued",
                    "page",
                    "medium",
                    "collection_title",
                    "chapter_number",
                    "unstructured",
                    "series-title",
                    "volume-title",
            ):
                if clean_str(rm.get(k)):
                    ref_extra[k] = clean_str(rm[k])
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    index=i,
                    # doing lookups would be a second import pass
                    target_release_id=None,
                    key=key,
                    year=year,
                    container_name=clean_str(ref_container_name),
                    title=clean_str(rm.get("article-title")),
                    locator=clean_str(rm.get("first-page")),
                    # TODO: just dump JSON somewhere here?
                    extra=ref_extra or None,
                ))

        # abstracts
        abstracts = []
        abstract = clean_str(obj.get("abstract"))
        if abstract and len(abstract) > 10:
            abstracts.append(
                fatcat_openapi_client.ReleaseAbstract(
                    mimetype="application/xml+jats", content=abstract))

        # extra fields
        extra: Dict[str, Any] = dict()
        extra_crossref: Dict[str, Any] = dict()
        # top-level extra keys
        if not container_id:
            if obj.get("container-title"):
                extra["container_name"] = container_name
        for key in "group-title":
            val = obj.get(key)
            if val:
                if type(val) == list:
                    val = val[0]
                if type(val) == str:
                    val = clean_str(val)
                    if val:
                        extra[key] = clean_str(val)
                else:
                    extra[key] = val
        # crossref-nested extra keys
        for key in ("subject", "type", "alternative-id", "archive", "funder"):
            val = obj.get(key)
            if val:
                if type(val) == str:
                    extra_crossref[key] = clean_str(val)
                else:
                    extra_crossref[key] = val
        if license_extra:
            extra_crossref["license"] = license_extra

        if len(obj["title"]) > 1:
            aliases = [clean_str(t) for t in obj["title"][1:]]
            aliases = [t for t in aliases if t]
            if aliases:
                extra["aliases"] = aliases

        # ISBN
        isbn13 = None
        for raw in obj.get("ISBN", []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj["type"] in (
                "journal-article",
                "conference-proceeding",
                "book",
                "dissertation",
                "book-chapter",
        ):
            release_stage: Optional[str] = "published"
        else:
            # unknown
            release_stage = None

        # filter out unreasonably huge releases
        if len(abstracts) > 100:
            self.counts["skip-huge-abstracts"] += 1
            return None
        if len(contribs) > 2000:
            self.counts["skip-huge-contribs"] += 1
            return None
        if len(refs) > 5000:
            self.counts["skip-huge-refs"] += 1
            return None

        # release date parsing is amazingly complex
        raw_date = obj["issued"]["date-parts"][0]
        if not raw_date or not raw_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_year = None
            release_date = None
        elif len(raw_date) == 3:
            release_year = raw_date[0]
            release_date = datetime.date(year=raw_date[0],
                                         month=raw_date[1],
                                         day=raw_date[2])
        else:
            # sometimes only the year is included, not the full date
            release_year = raw_date[0]
            release_date = None

        original_title: Optional[str] = None
        if obj.get("original-title"):
            ot = obj.get("original-title")
            if ot is not None:
                original_title = clean_str(ot[0], force_xml=True)

        title: Optional[str] = None
        if obj.get("title"):
            title = clean_str(obj["title"][0], force_xml=True)
            if not title or len(title) <= 1:
                # title can't be just a single character
                self.counts["skip-blank-title"] += 1
                return None

        doi = clean_doi(obj["DOI"].lower())
        if not doi:
            self.counts["skip-bad-doi"] += 1
            return None

        subtitle = None
        if obj.get("subtitle"):
            subtitle = clean_str(obj["subtitle"][0], force_xml=True)
            if not subtitle or len(subtitle) <= 1:
                # subtitle can't be just a single character
                subtitle = None

        if extra_crossref:
            extra["crossref"] = extra_crossref

        re = ReleaseEntity(
            work_id=None,
            container_id=container_id,
            title=title,
            subtitle=subtitle,
            original_title=original_title,
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            publisher=publisher,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                isbn13=isbn13,
            ),
            volume=clean_str(obj.get("volume")),
            issue=clean_str(obj.get("issue")),
            pages=clean_str(obj.get("page")),
            language=clean_str(obj.get("language")),
            license_slug=license_slug,
            extra=extra or None,
            abstracts=abstracts or None,
            contribs=contribs or None,
            refs=refs or None,
        )
        return re
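# Sketch of the "issued" date-parts handling above (hypothetical values): a
# full [year, month, day] triple yields a release_date, while a bare [year]
# keeps only release_year.
import datetime

_raw_date = [2012, 7, 1]
_release_date = datetime.date(year=_raw_date[0], month=_raw_date[1], day=_raw_date[2])
assert _release_date.isoformat() == "2012-07-01"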
Example #22
    def parse_record(self, record: Any) -> Optional[ReleaseEntity]:
        """
        record is a beautiful soup object
        returns a ReleaseEntity, or None

        In JALC metadata, both English and Japanese records are given for most
        fields.
        """

        extra: Dict[str, Any] = dict()
        extra_jalc: Dict[str, Any] = dict()

        titles = record.find_all("title")
        if not titles:
            return None
        title = titles[0].get_text().replace("\n", " ").strip()
        original_title = None
        if title.endswith("."):
            title = title[:-1]
        if len(titles) > 1:
            original_title = titles[1].get_text().replace("\n", " ").strip()
            if original_title.endswith("."):
                original_title = original_title[:-1]

        doi = None
        if record.doi:
            doi = clean_doi(record.doi.string.strip().lower())
            # TODO: following code is redundant with clean_doi()
            if not doi:
                return None
            if doi.startswith("http://dx.doi.org/"):
                doi = doi.replace("http://dx.doi.org/", "")
            elif doi.startswith("https://dx.doi.org/"):
                doi = doi.replace("https://dx.doi.org/", "")
            elif doi.startswith("http://doi.org/"):
                doi = doi.replace("http://doi.org/", "")
            elif doi.startswith("https://doi.org/"):
                doi = doi.replace("https://doi.org/", "")
            if not (doi.startswith("10.") and "/" in doi):
                sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
                doi = None
        if not doi:
            return None

        people = record.find_all("Person")
        contribs = parse_jalc_persons(people)

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i

        release_year = None
        release_date = None
        date = record.date or None
        if date:
            date = date.string
            if len(date) == 10:
                release_date_date = datetime.datetime.strptime(
                    date, DATE_FMT).date()
                release_year = release_date_date.year
                release_date = release_date_date.isoformat()
            elif len(date) == 4 and date.isdigit():
                release_year = int(date)

        pages = None
        if record.startingPage and record.startingPage.string.strip():
            pages = record.startingPage.string.strip()
            if record.endingPage and record.endingPage.string.strip():
                pages = "{}-{}".format(pages, record.endingPage.string.strip())
        # double check to prevent "-" as pages
        if pages and pages.strip() == "-":
            pages = None

        volume = None
        if record.volume:
            volume = record.volume.string
        issue = None
        if record.number:
            # note: number/issue transform
            issue = record.number.string

        # container
        issn = None
        issn_list = record.find_all("issn")
        if issn_list:
            # if we wanted the other ISSNs, would also need to uniq the list.
            # But we only need one to lookup ISSN-L/container
            issn = issn_list[0].string
        if issn:
            issnl = self.issn2issnl(issn)
        else:
            issnl = None
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)

        publisher = None
        container_name = None
        container_extra: Dict[str, Any] = dict()

        if record.publicationName:
            pubs = [
                p.get_text().replace("\n", " ").strip()
                for p in record.find_all("publicationName") if p.get_text()
            ]
            pubs = [clean_str(p) for p in pubs if p]
            assert pubs
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # eng/jpn ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            container_name = clean_str(pubs[0])
            if len(pubs) > 1:
                container_extra["original_name"] = clean_str(pubs[1])

        if record.publisher:
            pubs = [
                p.get_text().replace("\n", " ").strip()
                for p in record.find_all("publisher") if p.get_text()
            ]
            pubs = [p for p in pubs if p]
            if len(pubs) > 1 and pubs[0] == pubs[1]:
                pubs = [pubs[0]]
            if len(pubs) > 1 and is_cjk(pubs[0]):
                # ordering is not reliable
                pubs = [pubs[1], pubs[0]]
            if pubs:
                publisher = clean_str(pubs[0])
                if len(pubs) > 1:
                    container_extra["publisher_aliases"] = pubs[1:]

        if (container_id is None and self.create_containers
                and (issnl is not None) and container_name):
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container_extra["country"] = "jp"
            container_extra["languages"] = ["ja"]
            ce = fatcat_openapi_client.ContainerEntity(
                name=container_name,
                container_type="journal",
                publisher=publisher,
                issnl=issnl,
                extra=(container_extra or None),
            )
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            # short-cut future imports in same batch
            self._issnl_id_map[issnl] = container_id

        # the vast majority of works are in japanese
        # TODO: any indication when *not* in japanese?
        lang = "ja"

        # reasonable default for this collection
        release_type = "article-journal"

        # extra:
        #   translation_of
        #   aliases
        #   container_name
        #   group-title
        # always put at least an empty dict here to indicate the DOI registrar
        # (informally)
        extra["jalc"] = extra_jalc

        title = clean_str(title)
        if not title:
            return None

        re = ReleaseEntity(
            work_id=None,
            title=title,
            original_title=clean_str(original_title),
            release_type=release_type,
            release_stage="published",
            release_date=release_date,
            release_year=release_year,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(doi=doi, ),
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=lang,
            # license_slug
            container_id=container_id,
            contribs=contribs,
            extra=extra,
        )
        return re
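# Sketch of the publicationName ordering rule above: when the first entry is
# CJK, swap so the English name becomes container_name and the Japanese name
# goes to container extra (assumes the is_cjk() helper used above is in scope).
_pubs = ["品質管理", "Quality Management in Health Care"]
if len(_pubs) > 1 and is_cjk(_pubs[0]):
    _pubs = [_pubs[1], _pubs[0]]
assert _pubs[0] == "Quality Management in Health Care"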