Beispiel #1
0
def test_elasticsearch_release_kbart_year():
    """
    Check how a LOCKSS KBART year span interacts with a current-year release.

    The same release (dated this year) is converted twice, each time with a
    container whose only preservation signal is a LOCKSS year span starting
    in 1900:

    - span ending two years ago: release is expected to be unpreserved
    - span ending last year: release is expected to count as "dark"
      preserved and as covered by KBART
    """
    current_year = datetime.date.today().year

    def lockss_container(last_covered_year):
        # Fresh container for each scenario; only the span end differs.
        return ContainerEntity(
            name="dummy journal",
            extra={
                "kbart": {
                    "lockss": {
                        "year_spans": [[1900, last_covered_year]],
                    },
                },
            },
        )

    release = ReleaseEntity(
        title="something",
        release_year=current_year,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    release.state = "active"

    # (end of LOCKSS span, expected "preservation", expected kbart/preserved)
    scenarios = [
        (current_year - 2, "none", False),
        (current_year - 1, "dark", True),
    ]
    for span_end, expected_preservation, covered in scenarios:
        release.container = lockss_container(span_end)
        doc = release_to_elasticsearch(release)
        assert doc["release_year"] == current_year

        assert doc["preservation"] == expected_preservation
        expected_flags = [
            ("is_oa", True),
            ("is_longtail_oa", False),
            ("is_preserved", covered),
            ("in_web", False),
            ("in_dweb", False),
            ("in_ia", False),
            ("in_ia_sim", False),
            ("in_kbart", covered),
            ("in_jstor", False),
        ]
        for field, expected in expected_flags:
            # identity check: the flags must be real booleans
            assert doc[field] is expected
Beispiel #2
0
    def update_entity(self, ce: ContainerEntity) -> None:
        """
        Copy this form's values onto an existing container entity, in place.

        - every attribute listed in CONTAINER_SIMPLE_ATTRS is overwritten on
          the entity (a blank string is stored as None)
        - attributes listed in CONTAINER_EXTRA_ATTRS, and the list of URLs,
          are written into ``ce.extra`` only when the form has a value for
          them
        - a non-empty edit description is stored in ``ce.edit_extra``
        - if ``ce.extra`` ends up empty it is set to None

        The form must be validated *before* this method is called.
        """
        for attr_name in CONTAINER_SIMPLE_ATTRS:
            value = getattr(self, attr_name).data
            # blank strings are normalized to None
            setattr(ce, attr_name, None if value == "" else value)

        if not ce.extra:
            ce.extra = {}
        for attr_name in CONTAINER_EXTRA_ATTRS:
            value = getattr(self, attr_name).data
            if value:
                ce.extra[attr_name] = value

        url_values = [url_field.data for url_field in self.urls]
        if url_values:
            ce.extra["urls"] = url_values

        description = self.edit_description.data
        if description:
            ce.edit_extra = dict(description=description)

        # don't leave an empty dict behind
        if not ce.extra:
            ce.extra = None
Beispiel #3
0
    def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
        """
        Build a ContainerEntity from one row.

        row is a python dict (parsed from JSON). It is only read: the
        upper-cased ISSNs are kept in local variables rather than written
        back into ``row["extra"]``. A missing or null ``extra`` is treated
        as an empty dict.

        returns a ContainerEntity (or None if invalid or couldn't parse)

        raises KeyError if ``row`` has no "issnl" or "ident" key
        """

        name = clean_str(row.get("name"))
        if name and name.endswith("."):
            name = name[:-1]
        if not name:
            # Name is required (by schema)
            return None

        # Tolerate rows without an "extra" object; every field in it is optional.
        row_extra = row.get("extra") or {}

        # Copy over only the known extra fields that have a truthy value.
        passthrough_keys = (
            "urls",
            "webarchive_urls",
            "country",
            "sherpa_romeo",
            "ezb",
            "szczepanski",
            "doaj",
            "languages",
            "ia",
            "scielo",
            "kbart",
            "publisher_type",
            "platform",
        )
        extra = {k: row_extra[k] for k in passthrough_keys if row_extra.get(k)}

        # Guess the container type from the name; left as None if no hint.
        container_type = None
        lowered_name = name.lower()
        if "proceedings" in lowered_name:
            container_type = "proceedings"
        elif "journal " in lowered_name:
            container_type = "journal"

        # Normalize ISSNs to upper case without touching the caller's dict.
        # Falsy values (None, "") are passed through unchanged.
        issnp = row_extra.get("issnp")
        if issnp:
            issnp = issnp.upper()
        issne = row_extra.get("issne")
        if issne:
            issne = issne.upper()

        ce = ContainerEntity(
            issnl=row["issnl"],
            issnp=issnp,
            issne=issne,
            ident=row["ident"],
            name=name,
            container_type=container_type,
            publisher=clean_str(row.get("publisher")),
            wikidata_qid=row.get("wikidata_qid"),
            extra=extra,
        )
        return ce
Beispiel #4
0
def generic_deleted_entity(entity_type: str, ident: str) -> Any:
    """
    Build a placeholder entity of the given type, marked as deleted.

    entity_type is one of: container, creator, file, fileset, webcapture,
    release, work. The returned entity has ``ident`` set to the given value
    and ``state`` set to "deleted". Release entities are created with an
    empty ReleaseExtIds.

    raises NotImplementedError for any other entity_type
    """
    # Constructors are wrapped in lambdas so that only the requested entity
    # type is actually looked up and instantiated.
    factories = {
        "container": lambda: ContainerEntity(),
        "creator": lambda: CreatorEntity(),
        "file": lambda: FileEntity(),
        "fileset": lambda: FilesetEntity(),
        "webcapture": lambda: WebcaptureEntity(),
        "release": lambda: ReleaseEntity(ext_ids=ReleaseExtIds()),
        "work": lambda: WorkEntity(),
    }
    make_entity = factories.get(entity_type)
    if make_entity is None:
        raise NotImplementedError

    entity: Any = make_entity()
    entity.ident = ident
    entity.state = "deleted"
    return entity
Beispiel #5
0
 def to_entity(self) -> ContainerEntity:
     """
     Create a new ContainerEntity from this form.

     The entity is constructed with the form's name and then filled in
     via update_entity(). The name field must be non-empty (asserted).
     """
     container_name = self.name.data
     assert container_name
     new_container = ContainerEntity(name=container_name)
     self.update_entity(new_container)
     return new_container
Beispiel #6
0
def test_rich_elasticsearch_convert():
    """
    Convert a release with a container and one file to its elasticsearch
    document, and check the derived counts and preservation/access flags.

    The release is dated 1234; the container carries IA SIM, KBART (LOCKSS
    and JSTOR), Sherpa/Romeo and DOAJ metadata; the single PDF file has a
    dweb, a webarchive and a web (archive.org) URL.
    """
    container_extra = {
        "ia": {
            "sim": {"year_spans": [[1000, 1100]]},
        },
        "kbart": {
            "lockss": {"year_spans": [[1200, 1300]]},
            "jstor": {
                "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
            },
        },
        "sherpa_romeo": {"color": "blue"},
        "doaj": {"as_of": "2010-02-03"},
    }

    release = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    release.state = "active"
    release.container = ContainerEntity(
        name="dummy journal",
        extra=container_extra,
    )

    pdf_urls = [
        FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
        FileUrl(
            rel="webarchive",
            url="https://web.archive.org/web/20001122030405/http://example.com",
        ),
        FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
    ]
    release.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=pdf_urls,
            extra={"shadows": {}},
        )
    ]

    doc = release_to_elasticsearch(release)
    assert doc["release_year"] == release.release_year

    expected_counts = [
        ("file_count", 1),
        ("fileset_count", 0),
        ("webcapture_count", 0),
        ("ref_count", 2),
        ("ref_linked_count", 1),
    ]
    for field, count in expected_counts:
        assert doc[field] == count

    assert doc["preservation"] == "bright"
    expected_flags = [
        ("is_oa", True),
        ("is_longtail_oa", False),
        ("is_preserved", True),
        ("in_web", True),
        ("in_dweb", True),
        ("in_ia", True),
        ("in_ia_sim", False),
        ("in_kbart", True),
        ("in_jstor", True),
    ]
    for field, expected in expected_flags:
        # identity check: the flags must be real booleans
        assert doc[field] is expected
Beispiel #7
0
    def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
        """
        Convert one row (a python dict parsed from JSON) into a
        ContainerEntity.

        Selected top-level fields of the row are copied into the entity's
        ``extra``; DOAJ and SIM information is copied in reduced form under
        ``extra["doaj"]`` and ``extra["ia"]["sim"]``.

        Returns None if the row has no name, or if the name is empty after
        cleaning.
        """

        if not row.get("name"):
            # Name is required (by schema)
            return None

        # Top-level row fields copied verbatim into extra when truthy.
        passthrough_keys = (
            "issne",
            "issnp",
            "languages",
            "country",
            "urls",
            "abbrev",
            "coden",
            "aliases",
            "original_name",
            "first_year",
            "last_year",
            "platform",
            "default_license",
            "road",
            "mimetypes",
            "sherpa_romeo",
            "kbart",
        )
        extra = {key: row[key] for key in passthrough_keys if row.get(key)}
        # TODO: not including for now: norwegian, dois/crossref, ia

        # Only a subset of the DOAJ fields is kept.
        doaj = row.get("doaj")
        if doaj:
            doaj_extra = {
                field: doaj[field]
                for field in ("as_of", "works")
                if doaj.get(field)
            }
            if doaj_extra:
                extra["doaj"] = doaj_extra

        # TODO: would like an ia.longtail_ia flag
        sim = row.get("sim")
        if sim:
            # NB: "year_spans" may end up as None here, which is ugly, but
            # the "sim" dict itself must stay non-empty: otherwise
            # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim'
            # later on
            extra["ia"] = {
                "sim": {
                    "year_spans": sim.get("year_spans"),
                },
            }

        name = clean_str(row.get("name"))
        if not name:
            return None

        return ContainerEntity(
            issnl=row["issnl"],
            issne=row.get("issne"),
            issnp=row.get("issnp"),
            container_type=None,  # TODO
            name=name,
            publisher=clean_str(row.get("publisher")),
            wikidata_qid=None,  # TODO
            extra=extra,
        )
Beispiel #8
0
def enrich_container_entity(entity: ContainerEntity) -> ContainerEntity:
    """
    Attach the elasticsearch representation to an active container entity.

    For an entity in state "active", the result of
    container_to_elasticsearch() (called with force_bool=False) is stored
    on ``entity._es``. Entities in any other state (including "redirect"
    and "deleted") are left untouched.

    Returns the same entity object that was passed in.
    """
    if entity.state == "active":
        entity._es = container_to_elasticsearch(entity, force_bool=False)
    return entity
Beispiel #9
0
def test_choose_primary_container(api) -> None:
    """
    Exercise ContainerMerger.choose_primary_container() on a set of
    candidate containers that differ in metadata completeness, number of
    existing redirects, and release counts.

    All "partial"/"complete" fixtures share the same metadata; they differ
    only in ident, release count and redirects.
    """

    release_counts = dict()
    redirects = dict()
    merger = ContainerMerger(api=api)

    def register(entity, release_count, redirected_from=()):
        # Record the bookkeeping the merger is given alongside the entities.
        release_counts[entity.ident] = release_count
        redirects[entity.ident] = list(redirected_from)
        return entity

    def detailed_container(ident):
        # Container with publisher, ISSN-L, status and extra metadata;
        # a fresh extra dict is built for every entity.
        return ContainerEntity(
            ident=ident,
            name="dummy complete journal",
            publisher="some publisher",
            issnl="1234-5678",
            publication_status="active",
            extra=dict(asdf=123, ia=dict(asdf=True)),
        )

    ce_stub = register(
        ContainerEntity(
            ident="pppppp5apzfhbbxxc7rgu2yw6m",
            name="dummy journal",
        ),
        0,
    )
    ce_partial = register(detailed_container("eeeeeeeapzfhbbxxc7rgu2yw6m"), 0)
    ce_partial_redirects = register(
        detailed_container("rrrrrrrrrrfhbbxxc7rgu2yw6m"),
        0,
        ["zzzzzzzzrrfhbbxxc7rgu2yw6m"],
    )
    ce_complete_zero = register(
        detailed_container("oooooooapzfhbbxxc7rgu2yw6m"), 0)
    ce_complete_small = register(
        detailed_container("cccccccapzfhbbxxc7rgu2yw6m"), 10)
    ce_complete_big = register(
        detailed_container("ddddddddpzfhbbxxc7rgu2yw6m"), 9999999)

    # (candidates, entity expected to be chosen as primary)
    cases = [
        ([ce_stub, ce_partial], ce_partial),
        ([ce_stub, ce_complete_zero, ce_partial], ce_complete_zero),
        (
            [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial],
            ce_partial_redirects,
        ),
        (
            [ce_stub, ce_complete_zero, ce_complete_small, ce_partial],
            ce_complete_small,
        ),
        (
            [
                ce_stub,
                ce_complete_big,
                ce_complete_zero,
                ce_complete_small,
                ce_partial,
            ],
            ce_complete_big,
        ),
        ([ce_complete_small, ce_complete_big], ce_complete_big),
    ]
    for candidates, expected in cases:
        chosen = merger.choose_primary_container(candidates, redirects,
                                                 release_counts)
        assert chosen == expected.ident