Example 1
    def update_entity(self, fe: FileEntity) -> None:
        """
        Mutates in place, updating fields with values from this form.

        Form must be validated *before* calling this function.
        """
        for simple_attr in FILE_SIMPLE_ATTRS:
            a = getattr(self, simple_attr).data
            # be flexible about hash capitalization
            if simple_attr in ("md5", "sha1", "sha256"):
                a = a.lower()
            # special case blank strings
            if a == "":
                a = None
            setattr(fe, simple_attr, a)
        fe.urls = []
        for u in self.urls:
            fe.urls.append(
                FileUrl(
                    rel=u.rel.data or None,
                    url=u.url.data or None,
                ))
        fe.release_ids = []
        for ri in self.release_ids:
            fe.release_ids.append(ri.data)
        if self.edit_description.data:
            fe.edit_extra = dict(description=self.edit_description.data)
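The normalization applied inside the loop above (lower-casing hash fields, coercing blank strings to None) can be shown in isolation; the helper below is a hypothetical sketch for illustration, not part of the form class.

def normalize_simple_attr(name: str, value):
    # hypothetical helper mirroring update_entity(): hashes are stored
    # lower-case, and empty strings are stored as None
    if name in ("md5", "sha1", "sha256") and value:
        value = value.lower()
    if value == "":
        value = None
    return value

assert normalize_simple_attr("md5", "E1FD97475C8AA102568F5D70A1BD0C07") == "e1fd97475c8aa102568f5d70a1bd0c07"
assert normalize_simple_attr("mimetype", "") is None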
Example 2
    def clean_entity(self, entity: FileEntity) -> FileEntity:
        """
        TODO: mimetype is bogus like (???) => clean mimetype
        """

        # URL has ://web.archive.org/web/None/ link => delete URL
        entity.urls = [
            u for u in entity.urls
            if "://web.archive.org/web/None/" not in u.url
        ]

        # URL has ://archive.org/ link with rel=repository => rel=archive
        for u in entity.urls:
            if "://archive.org/" in u.url and u.rel == "repository":
                u.rel = "archive"

        # URL has short wayback date ("2017") and another url with that as prefix => delete URL
        stub_wayback_urls = []
        full_wayback_urls = []
        for u in entity.urls:
            if "://web.archive.org/web/" in u.url:
                if len(u.url.split("/")[4]) <= 8:
                    stub_wayback_urls.append(u.url)
                else:
                    full_wayback_urls.append("/".join(u.url.split("/")[5:]))
        for stub in stub_wayback_urls:
            target = "/".join(stub.split("/")[5:])
            if target in full_wayback_urls:
                entity.urls = [u for u in entity.urls if u.url != stub]

        return entity
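The split-index arithmetic above is compact; as a standalone illustration (with a made-up URL), index 4 of a wayback URL is the timestamp segment and index 5 onward is the original URL:

# illustration only; the example.com URL is made up
stub = "https://web.archive.org/web/2017/http://example.com/paper.pdf"
parts = stub.split("/")
assert parts[4] == "2017"  # short ("stub") timestamp, len <= 8
assert "/".join(parts[5:]) == "http://example.com/paper.pdf"  # original URL suffix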
Example 3
    def generic_file_cleanups(existing: FileEntity) -> FileEntity:
        """
        Conservative cleanup of existing file entities.

        Intended to be used in most bulk cleanups and other file entity
        updates, to reduce edit volume for catalog size/churn efficiency.

        Note: the former check for 'None' as a wayback datetime has been
        completely cleaned up
        """

        # update old/deprecated 'rel' on URLs
        for i in range(len(existing.urls)):
            u = existing.urls[i]
            if u.rel == "repository" and "://archive.org/download/" in u.url:
                existing.urls[i].rel = "archive"
            if u.rel == "social":
                u.rel = "academicsocial"

        # remove exact URL duplicates, while preserving order, and removing
        # "later" copies, not "first" copies
        # this is sensitive to both url.url and url.rel combined!
        dedupe_urls = []
        for url_pair in existing.urls:
            if url_pair not in dedupe_urls:
                dedupe_urls.append(url_pair)
        existing.urls = dedupe_urls

        # remove URLs which are near-duplicates
        redundant_urls = []
        all_urls = [u.url for u in existing.urls]
        all_wayback_urls = [
            u.url for u in existing.urls if "://web.archive.org/web/" in u.url
        ]
        for url in all_urls:
            # https/http redundancy
            if url.startswith("http://") and url.replace(
                    "http://", "https://", 1) in all_urls:
                redundant_urls.append(url)
                continue
            # default HTTP port included and not included
            if ":80/" in url and url.replace(":80", "", 1) in all_urls:
                redundant_urls.append(url)
                continue
            # partial and complete wayback timestamps
            if "://web.archive.org/web/2017/" in url:
                original_url = "/".join(url.split("/")[5:])
                assert len(original_url) > 5
                for wb_url in all_wayback_urls:
                    alt_timestamp = wb_url.split("/")[4]
                    if len(alt_timestamp) >= 10 and original_url in wb_url:
                        redundant_urls.append(url)
                        break

        existing.urls = [
            u for u in existing.urls if u.url not in redundant_urls
        ]
        return existing
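A standalone illustration of the https/http and default-port redundancy rules above (all URLs are made up):

urls = [
    "http://example.com/x.pdf",
    "https://example.com/x.pdf",
    "http://example.com:80/y.pdf",
    "http://example.com/y.pdf",
]
redundant = [
    u for u in urls
    if (u.startswith("http://") and u.replace("http://", "https://", 1) in urls)
    or (":80/" in u and u.replace(":80", "", 1) in urls)
]
# the plain-http twin and the explicit-port variant are flagged; the rest are kept
assert redundant == ["http://example.com/x.pdf", "http://example.com:80/y.pdf"]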
Example 4
    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        file_ident = uuid2fcid(row["file_ident"])
        wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
        edit_extra = row["edit_extra"]
        assert edit_extra["link_source"] in ["unpaywall", "doi"]
        file_edit_doi = clean_doi(edit_extra["link_source_id"])

        if not file_edit_doi:
            self.counts["skip-bad-doi"] += 1
            return None

        # check that the "wrong" release exists and doesn't have the DOI
        wrong_release = None
        try:
            wrong_release = self.api.get_release(wrong_release_ident)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        if not wrong_release:
            self.counts["skip-wrong-release-missing"] += 1
            return None

        if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
            self.counts["skip-wrong-release-is-ok"] += 1
            return None

        # fetch the "correct" release, if any
        fixed_release_ids = []
        correct_release = None
        try:
            correct_release = self.api.lookup_release(doi=file_edit_doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        if correct_release:
            fixed_release_ids.append(correct_release.ident)

        fe = FileEntity(
            ident=file_ident,
            release_ids=fixed_release_ids,
            edit_extra=edit_extra,
        )
        fe._wrong_release_ident = wrong_release_ident
        return fe
Example 5
    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

        request = row["request"]
        file_meta = row["file_meta"]

        # double check that want() filtered request correctly (eg, old requests)
        if request.get("ingest_type") not in ("pdf", "xml"):
            self.counts["skip-ingest-type"] += 1
            return None
        assert (request["ingest_type"], file_meta["mimetype"]) in [
            ("pdf", "application/pdf"),
            ("xml", "application/xml"),
            ("xml", "application/jats+xml"),
            ("xml", "application/tei+xml"),
            ("xml", "text/xml"),
        ]

        # identify release by fatcat ident, or extid lookup, or biblio-glutton match
        release_ident = self.parse_ingest_release_ident(row)

        if not release_ident:
            self.counts["skip-release-not-found"] += 1
            return None

        terminal = self.parse_terminal(row)
        if not terminal:
            # TODO: support archive.org hits?
            self.counts["skip-no-terminal"] += 1
            return None

        urls = self.parse_urls(row, terminal)

        fe = FileEntity(
            md5=file_meta["md5hex"],
            sha1=file_meta["sha1hex"],
            sha256=file_meta["sha256hex"],
            size=file_meta["size_bytes"],
            mimetype=file_meta["mimetype"],
            release_ids=[release_ident],
            urls=urls,
        )

        edit_extra = self.parse_edit_extra(row)
        if edit_extra:
            fe.edit_extra = edit_extra
        return fe
Example 6
def test_file_meta_importer_basic(file_meta_importer):

    # insert two file entities
    api = file_meta_importer.api
    eg = quick_eg(file_meta_importer.api)
    # with full metadata
    f1edit = api.create_file(
        eg.editgroup_id,
        FileEntity(
            size=372121,
            md5="e1fd97475c8aa102568f5d70a1bd0c07",
            sha1="0000045687dad717ed6512e395b04ec9c00995b7",
            sha256=
            "51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0",
            mimetype="application/pdf",
        ))
    # partial/stub metadata
    f2edit = api.create_file(
        eg.editgroup_id,
        FileEntity(
            sha1="00000376ad49f56145721503f1eb5e6e49e779fd",
            mimetype="application/pdf",
        ))
    api.accept_editgroup(eg.editgroup_id)

    last_index = file_meta_importer.api.get_changelog(limit=1)[0].index

    with open('tests/files/example_file_meta.json', 'r') as f:
        counts = JsonLinePusher(file_meta_importer, f).run()

    assert counts['insert'] == 0
    assert counts['exists'] == 0
    assert counts['update'] == 1
    assert counts['skip-no-match'] == 4
    assert counts['skip-missing-field'] == 1
    assert counts['skip-existing-complete'] == 1

    # cleanup file entities
    eg = quick_eg(file_meta_importer.api)
    api.delete_file(eg.editgroup_id, f1edit.ident)
    api.delete_file(eg.editgroup_id, f2edit.ident)
    api.accept_editgroup(eg.editgroup_id)
Example 7
def generic_deleted_entity(entity_type: str, ident: str) -> Any:
    if entity_type == "container":
        entity: Any = ContainerEntity()
    elif entity_type == "creator":
        entity = CreatorEntity()
    elif entity_type == "file":
        entity = FileEntity()
    elif entity_type == "fileset":
        entity = FilesetEntity()
    elif entity_type == "webcapture":
        entity = WebcaptureEntity()
    elif entity_type == "release":
        entity = ReleaseEntity(ext_ids=ReleaseExtIds())
    elif entity_type == "work":
        entity = WorkEntity()
    else:
        raise NotImplementedError
    entity.ident = ident
    entity.state = "deleted"
    return entity
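A usage sketch (the identifier below is illustrative only):

tombstone = generic_deleted_entity("file", "aaaasb5apzfhbbxxc7rgu2yw6m")
assert isinstance(tombstone, FileEntity)
assert tombstone.state == "deleted"
assert tombstone.ident == "aaaasb5apzfhbbxxc7rgu2yw6m"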
Example 8
    def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]:
        dois = [d.lower() for d in obj.get("dois", [])]

        # lookup dois
        re_list = set()
        for doi in dois:
            doi = clean_doi(doi)
            if not doi:
                self.counts["skip-bad-doi"] += 1
                return None
            try:
                re = self.api.lookup_release(doi=doi)
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                re = None
            if re is None:
                # print("DOI not found: {}".format(doi))
                pass
            else:
                re_list.add(re.ident)

        # look up other external ids
        for extid_type in (
            "arxiv",
            "pmid",
            "pmcid",
            "jstor",
            "wikidata_qid",
            "core",
            "isbn13",
            "ark",
        ):
            extid = obj.get(extid_type)
            if extid:
                try:
                    re = self.api.lookup_release(**{extid_type: extid})
                except fatcat_openapi_client.rest.ApiException as err:
                    if err.status != 404:
                        raise err
                    re = None
                if re is None:
                    pass
                else:
                    re_list.add(re.ident)

        release_ids = list(re_list)
        if len(release_ids) == 0:
            self.counts["skip-no-releases"] += 1
            return None
        if len(release_ids) > SANE_MAX_RELEASES:
            self.counts["skip-too-many-releases"] += 1
            return None

        # parse URLs and CDX
        urls_set = set()
        for url in obj.get("urls", []):
            url = make_rel_url(url, default_link_rel=self.default_link_rel)
            if url is not None:
                urls_set.add(url)
        for cdx in obj.get("cdx", []):
            original = cdx["url"]
            if cdx.get("dt"):
                wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
                urls_set.add(("webarchive", wayback))
            url = make_rel_url(original, default_link_rel=self.default_link_rel)
            if url is not None:
                urls_set.add(url)
        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls_set]
        if len(urls) == 0:
            self.counts["skip-no-urls"] += 1
            return None
        if len(urls) > SANE_MAX_URLS:
            self.counts["skip-too-many-urls"] += 1
            return None

        size = obj.get("size")
        if size:
            size = int(size)

        mimetype = obj.get("mimetype", self.default_mimetype)
        if not mimetype and urls:
            if urls[0].url.endswith(".pdf"):
                mimetype = "application/pdf"

        fe = FileEntity(
            md5=obj.get("md5"),
            sha1=obj["sha1"],
            sha256=obj.get("sha256"),
            size=size,
            mimetype=mimetype,
            release_ids=release_ids,
            urls=urls,
        )
        return fe
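How a CDX row's 14-digit "dt" timestamp becomes a wayback URL in the loop above, shown with a made-up record:

cdx = {"url": "http://example.com/paper.pdf", "dt": "20200102030405"}
wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
assert wayback == "https://web.archive.org/web/20200102030405/http://example.com/paper.pdf"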
Example 9
    def to_entity(self) -> FileEntity:
        assert self.sha1.data
        entity = FileEntity()
        self.update_entity(entity)
        return entity
Example 10
def test_rich_elasticsearch_convert():
    r = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {
                "color": "blue"
            },
            "doaj": {
                "as_of": "2010-02-03"
            },
        },
    )
    r.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=[
                FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
                FileUrl(
                    rel="webarchive",
                    url=
                    "https://web.archive.org/web/20001122030405/http://example.com",
                ),
                FileUrl(rel="web",
                        url="https://archive.org/details/blah/file.pdf"),
            ],
            extra={
                "shadows": {},
            },
        )
    ]
    es = release_to_elasticsearch(r)
    assert es["release_year"] == r.release_year
    assert es["file_count"] == 1
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 0
    assert es["ref_count"] == 2
    assert es["ref_linked_count"] == 1

    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is True
Example 11
def enrich_file_entity(entity: FileEntity) -> FileEntity:
    if entity.state == "active":
        entity._es = file_to_elasticsearch(entity)
    return entity
Example 12
    def to_entity(self):
        assert self.sha1.data
        entity = FileEntity()
        self.update_entity(entity)
        return entity
Example 13
    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:

        request = row["request"]

        # double check that want() filtered request correctly
        if request.get("ingest_type") not in [
                "dataset",
        ]:
            self.counts["skip-ingest-type"] += 1
            return None

        # identify release by fatcat ident, or extid lookup
        release_ident = self.parse_ingest_release_ident(row)

        if not release_ident:
            self.counts["skip-release-not-found"] += 1
            return None

        assert row["file_count"] == len(row["manifest"]) == 1
        file_meta = row["manifest"][0]
        # print(file_meta)
        assert file_meta["status"] == "success"

        # add file-level access URLs
        entity_urls = []
        if file_meta.get("platform_url"):
            entity_urls.append(
                FileUrl(rel="web", url=file_meta["platform_url"]))
        if file_meta.get("terminal_url") and file_meta.get("terminal_dt"):
            entity_urls.append(
                FileUrl(
                    rel="webarchive",
                    url=
                    f"https://web.archive.org/web/{file_meta['terminal_dt']}/{file_meta['terminal_url']}",
                ))
        if row["ingest_strategy"] == "archiveorg-file":
            entity_urls.append(
                FileUrl(
                    rel="archive",
                    url=
                    f"https://archive.org/download/{row['archiveorg_item_name']}/{file_meta['path']}",
                ))

        if not entity_urls:
            self.counts["skip-no-access-url"] += 1
            return None

        entity_extra: Dict[str, Any] = dict()
        entity_extra["path"] = file_meta["path"]

        # this is to work around a bug in old sandcrawler ingest code
        if file_meta["md5"] == file_meta["sha1"]:
            self.counts["skip-bad-hashes"] += 1
            return None

        fe = FileEntity(
            md5=file_meta["md5"],
            sha1=file_meta["sha1"],
            sha256=file_meta["sha256"],
            size=file_meta["size"],
            mimetype=file_meta["mimetype"],
            release_ids=[release_ident],
            urls=entity_urls,
            extra=entity_extra or None,
        )
        if not (fe.md5 and fe.sha1 and fe.sha256 and
                (fe.size is not None) and fe.mimetype):
            self.counts["skip-partial-file-info"] += 1
            return None

        edit_extra = self.parse_edit_extra(row)
        if edit_extra:
            fe.edit_extra = edit_extra
        return fe
Example 14
def test_merge_file_metadata_from(api) -> None:
    fm = FileMerger(api=api)
    fe_partial = FileEntity(
        ident="aaaasb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3aaaaaa",
    )
    fe_norelease = FileEntity(
        ident="bbbbsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
    )
    fe_nourls = FileEntity(
        ident="ccccsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
    )
    fe_complete = FileEntity(
        ident="ddddsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="ddddddd315bfc7d3aab0db933e95e632",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )
    fe_pseudo_complete = FileEntity(
        ident="eeeesb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )
    fe_another_release_id = FileEntity(
        ident="fffffffapzfhbbxxc7rgu2yw6m",
        release_ids=["qqqqqg7mxrayxfltget7fqcrjy"],
    )
    fe_another_url = FileEntity(
        ident="zzzzzzzapzfhbbxxc7rgu2yw6m",
        urls=[
            FileUrl(rel="repository", url="http://someuni.edu/repo/file.pdf"),
        ],
    )
    fe_more_extra = FileEntity(
        ident="fffffffapzfhbbxxc7rgu2yw6m",
        release_ids=["qqqqqg7mxrayxfltget7fqcrjy"],
        extra=dict(thang=456),
    )

    assert fm.merge_file_metadata_from(fe_nourls, fe_partial) is False
    assert fm.merge_file_metadata_from(fe_complete,
                                       fe_pseudo_complete) is False
    assert fm.merge_file_metadata_from(fe_complete, fe_complete) is False
    assert fm.merge_file_metadata_from(fe_partial, fe_norelease) is True
    assert fe_partial.md5 == fe_norelease.md5
    assert fe_partial.size == fe_norelease.size
    assert fm.merge_file_metadata_from(fe_partial, fe_complete) is True
    assert fe_partial.md5 != fe_complete.md5
    assert fe_partial.extra == fe_complete.extra
    assert set([(u.rel, u.url) for u in fe_partial.urls or []
                ]) == set([(u.rel, u.url) for u in fe_complete.urls or []])
    assert fe_partial.release_ids == fe_complete.release_ids
    assert fm.merge_file_metadata_from(fe_partial,
                                       fe_another_release_id) is True
    assert fe_partial.release_ids == [
        "dlrxjg7mxrayxfltget7fqcrjy",
        "qqqqqg7mxrayxfltget7fqcrjy",
    ]
    assert fm.merge_file_metadata_from(fe_partial,
                                       fe_another_release_id) is False
    assert fm.merge_file_metadata_from(fe_partial, fe_more_extra) is True
    assert fe_partial.extra == dict(asdf=123, thang=456)
    assert fm.merge_file_metadata_from(fe_partial, fe_more_extra) is False
    assert fm.merge_file_metadata_from(fe_partial, fe_another_url) is True
    assert fe_partial.urls[-1].url == "http://someuni.edu/repo/file.pdf"
    assert fm.merge_file_metadata_from(fe_partial, fe_another_url) is False
Example 15
def test_choose_primary_file(api) -> None:

    fm = FileMerger(api=api)
    fe_partial = FileEntity(
        ident="aaaasb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3aaaaaa",
    )
    fe_norelease = FileEntity(
        ident="bbbbsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
    )
    fe_nourls = FileEntity(
        ident="ccccsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
    )
    fe_complete = FileEntity(
        ident="ddddsb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        md5="d2c7318315bfc7d3aab0db933e95e632",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )
    fe_pseudo_complete = FileEntity(
        ident="eeeesb5apzfhbbxxc7rgu2yw6m",
        sha1="b1beebb5f979121cd234c69b08e3f42af3bbbbbb",
        sha256=
        "528064c7664a96c79c80c423210f6f9f4fafe949dd59dfd1572a04b906d5e163",
        size=60719,
        mimetype="application/pdf",
        release_ids=["dlrxjg7mxrayxfltget7fqcrjy"],
        urls=[
            FileUrl(rel="web", url="http://aughty.org/pdf/future_open.pdf"),
        ],
        extra=dict(asdf=123),
    )

    assert fm.choose_primary_file([fe_partial, fe_norelease
                                   ]) == "bbbbsb5apzfhbbxxc7rgu2yw6m"
    assert (fm.choose_primary_file([fe_partial, fe_nourls, fe_norelease
                                    ]) == "ccccsb5apzfhbbxxc7rgu2yw6m")
    assert (fm.choose_primary_file(
        [fe_partial, fe_complete, fe_nourls,
         fe_norelease]) == "ddddsb5apzfhbbxxc7rgu2yw6m")
    assert (fm.choose_primary_file(
        [fe_partial, fe_pseudo_complete, fe_nourls,
         fe_norelease]) == "ccccsb5apzfhbbxxc7rgu2yw6m")