Example #1
    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):

        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace(
            'sha1:', ''))).decode('ascii').lower()

        fe = fatcat_openapi_client.FileEntity(
            sha1=sha1,
            size=int(file_size),
            mimetype=mimetype,
            release_ids=[],
            urls=[],
        )

        # parse URLs and CDX
        original = cdx['url']
        assert len(cdx['dt']) >= 8
        wayback = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'], original)
        fe.urls.append(
            fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
        original_url = make_rel_url(original,
                                    default_link_rel=self.default_link_rel)
        if original_url is not None:
            fe.urls.append(
                fatcat_openapi_client.FileUrl(rel=original_url[0],
                                              url=original_url[1]))

        return fe
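
The base32-to-hex conversion at the top of Example #1 is worth seeing in isolation: CDX records carry SHA-1 digests as base32 strings with a "sha1:" prefix, while fatcat stores lowercase hex. A minimal sketch, using a made-up digest:

    import base64

    # Hypothetical CDX-style digest: 32 base32 characters encode 20 bytes.
    sha1_key = "sha1:FZ2DOHAZGPPAWPQ74RRSDLXDYMV7KBCS"
    sha1_hex = base64.b16encode(
        base64.b32decode(sha1_key.replace("sha1:", ""))
    ).decode("ascii").lower()
    assert len(sha1_hex) == 40  # 40-character lowercase hex SHA-1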
Example #2
    def parse_record(self, row):

        extid = row['identifier'].strip()

        # check/cleanup DOI
        if self.extid_type == 'doi':
            extid = extid.lower()
            extid = extid.replace('http://doi.org/', '')
            extid = extid.replace('https://doi.org/', '')
            if extid.startswith('doi:'):
                extid = extid[4:]
            if not extid.startswith('10.'):
                self.counts['skip-extid-invalid'] += 1
                return None

        # lookup extid
        try:
            re = self.api.lookup_release(**{self.extid_type: extid})
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status == 404:
                # bail on 404 (release not in DB)
                self.counts['skip-extid-not-found'] += 1
                return None
            elif err.status == 400:
                self.counts['skip-extid-invalid'] += 1
                return None
            else:
                raise err

        url = make_rel_url(row['final_url'], self.default_link_rel)
        if not url:
            self.counts['skip-url'] += 1
            return None
        if not row['final_timestamp']:
            self.counts['skip-missing-timestamp'] += 1
            return None
        wayback = "https://web.archive.org/web/{}/{}".format(
            row['final_timestamp'], row['final_url'])
        urls = [url, ("webarchive", wayback)]

        urls = [
            fatcat_openapi_client.FileUrl(rel=rel, url=url)
            for (rel, url) in urls
        ]

        if len(urls) > SANE_MAX_URLS:
            self.counts['skip-too-many-url'] += 1
            return None

        fe = fatcat_openapi_client.FileEntity(
            sha1=b32_hex(row['final_sha1']),
            mimetype=row['final_mimetype'] or self.default_mimetype,
            release_ids=[re.ident],
            urls=urls,
        )
        return fe
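
The DOI cleanup in Example #2 amounts to a small normalization routine. A standalone sketch of the same rules (assuming the prefix list and the "10." check are the whole contract):

    def normalize_doi(raw):
        """Sketch: lowercase, strip known prefixes, reject non-DOIs."""
        doi = raw.strip().lower()
        for prefix in ('http://doi.org/', 'https://doi.org/', 'doi:'):
            if doi.startswith(prefix):
                doi = doi[len(prefix):]
        return doi if doi.startswith('10.') else None

    assert normalize_doi('https://doi.org/10.123/abc') == '10.123/abc'
    assert normalize_doi('not-a-doi') is None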
Example #3
    def parse_record(self, row):

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        file_meta = row
        fe = fatcat_openapi_client.FileEntity(
            md5=file_meta['md5hex'],
            sha1=file_meta['sha1hex'],
            sha256=file_meta['sha256hex'],
            size=file_meta['size_bytes'],
            mimetype=file_meta['mimetype'],
        )
        return fe
Example #4
    def parse_record(self, row: Dict[str, Any]) -> FileEntity:

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        file_meta = row
        fe = fatcat_openapi_client.FileEntity(
            md5=file_meta["md5hex"],
            sha1=file_meta["sha1hex"],
            sha256=file_meta["sha256hex"],
            size=file_meta["size_bytes"],
            mimetype=file_meta["mimetype"],
        )
        return fe
Example #5
    def parse_record(self, row):

        request = row['request']
        file_meta = row['file_meta']

        # double check that want() filtered this request correctly (e.g., old requests)
        if request.get('ingest_type') not in ('pdf', 'xml'):
            self.counts['skip-ingest-type'] += 1
            return None
        assert (request['ingest_type'], file_meta['mimetype']) in [
            ("pdf", "application/pdf"),
            ("xml", "application/xml"),
            ("xml", "application/jats+xml"),
            ("xml", "application/tei+xml"),
            ("xml", "text/xml"),
        ]

        # identify release by fatcat ident, or extid lookup, or biblio-glutton match
        release_ident = self.parse_ingest_release_ident(row)

        if not release_ident:
            self.counts['skip-release-not-found'] += 1
            return None

        terminal = self.parse_terminal(row)
        if not terminal:
            # TODO: support archive.org hits?
            self.counts['skip-no-terminal'] += 1
            return None

        urls = self.parse_urls(row, terminal)

        fe = fatcat_openapi_client.FileEntity(
            md5=file_meta['md5hex'],
            sha1=file_meta['sha1hex'],
            sha256=file_meta['sha256hex'],
            size=file_meta['size_bytes'],
            mimetype=file_meta['mimetype'],
            release_ids=[release_ident],
            urls=urls,
        )

        edit_extra = self.parse_edit_extra(row)
        if edit_extra:
            fe.edit_extra = edit_extra
        return fe
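
The assert in Example #5 hard-codes the accepted (ingest_type, mimetype) pairs inline. A sketch of the same allowlist as a module-level constant, so a want() filter and parse_record() could share it (the constant name is an invention here):

    # Hypothetical shared constant; pairs copied from the assert above.
    ALLOWED_INGEST_MIMETYPES = {
        ("pdf", "application/pdf"),
        ("xml", "application/xml"),
        ("xml", "application/jats+xml"),
        ("xml", "application/tei+xml"),
        ("xml", "text/xml"),
    }

    assert ("pdf", "application/pdf") in ALLOWED_INGEST_MIMETYPES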
Example #6
    def parse_record(self, obj):
        """
        We do the release lookup in this method: try DOI first, then PMID, and finally ISBN13.
        """

        shadow_corpus = obj['shadow']['shadow_corpus']
        assert shadow_corpus == shadow_corpus.strip().lower()
        doi = clean_doi(obj['shadow'].get('doi'))
        pmid = clean_pmid(obj['shadow'].get('pmid'))
        isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
        shadow_id = (obj['shadow'].get('shadow_id') or '').strip()
        assert shadow_id

        extra = {'{}_id'.format(shadow_corpus): shadow_id}
        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid),
                                   ('isbn13', isbn13)]:
            if not ext_id:
                continue
            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id

        # lookup release via several idents
        re = None
        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid),
                                   ('isbn13', isbn13)]:
            if not ext_id:
                continue
            try:
                re = self.api.lookup_release(**{ext_type: ext_id})
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status not in (404, 400):
                    raise err
                re = None
            if re:
                break

        if not re:
            self.counts['skip-release-not-found'] += 1
            return None

        release_ids = [re.ident]

        # parse single CDX into URLs (if exists)
        urls = []
        if obj.get('cdx'):
            url = make_rel_url(obj['cdx']['url'],
                               default_link_rel=self.default_link_rel)
            if url is not None:
                urls.append(url)
            wayback = "https://web.archive.org/web/{}/{}".format(
                obj['cdx']['datetime'], obj['cdx']['url'])
            urls.append(("webarchive", wayback))
        urls = [
            fatcat_openapi_client.FileUrl(rel=rel, url=url)
            for (rel, url) in urls
        ]

        fe = fatcat_openapi_client.FileEntity(
            md5=obj['file_meta']['md5hex'],
            sha1=obj['file_meta']['sha1hex'],
            sha256=obj['file_meta']['sha256hex'],
            size=int(obj['file_meta']['size_bytes']),
            mimetype=obj['file_meta']['mimetype'] or None,
            release_ids=release_ids,
            urls=urls,
            extra=dict(shadows=extra),
        )
        return fe
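
The lookup loop in Example #6 is a fallback chain: the first identifier that resolves wins. A generic sketch of that pattern, assuming an `api` object exposing fatcat's lookup_release(), with 404 and 400 treated as a miss:

    import fatcat_openapi_client

    def first_release_match(api, candidates):
        """Sketch: return the first release matching an (ext_type, ext_id)
        pair, skipping empty ids; unexpected API errors are re-raised."""
        for ext_type, ext_id in candidates:
            if not ext_id:
                continue
            try:
                return api.lookup_release(**{ext_type: ext_id})
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status not in (404, 400):
                    raise
        return None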
Example #7
    def parse_record(self, obj):
        dois = [d.lower() for d in obj.get('dois', [])]

        # lookup dois
        re_list = set()
        for doi in dois:
            doi = clean_doi(doi)
            if not doi:
                self.counts['skip-bad-doi'] += 1
                return None
            try:
                re = self.api.lookup_release(doi=doi)
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                re = None
            if re is not None:
                re_list.add(re.ident)

        # look up other external ids
        for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid',
                           'core', 'isbn13', 'ark'):
            extid = obj.get(extid_type)
            if extid:
                try:
                    re = self.api.lookup_release(**{extid_type: extid})
                except fatcat_openapi_client.rest.ApiException as err:
                    if err.status != 404:
                        raise err
                    re = None
                if re is not None:
                    re_list.add(re.ident)

        release_ids = list(re_list)
        if len(release_ids) == 0:
            self.counts['skip-no-releases'] += 1
            return None
        if len(release_ids) > SANE_MAX_RELEASES:
            self.counts['skip-too-many-releases'] += 1
            return None

        # parse URLs and CDX
        urls = set()
        for url in obj.get('urls', []):
            url = make_rel_url(url, default_link_rel=self.default_link_rel)
            if url is not None:
                urls.add(url)
        for cdx in obj.get('cdx', []):
            original = cdx['url']
            if cdx.get('dt'):
                wayback = "https://web.archive.org/web/{}/{}".format(
                    cdx['dt'], original)
                urls.add(("webarchive", wayback))
            url = make_rel_url(original,
                               default_link_rel=self.default_link_rel)
            if url is not None:
                urls.add(url)
        urls = [
            fatcat_openapi_client.FileUrl(rel=rel, url=url)
            for (rel, url) in urls
        ]
        if len(urls) == 0:
            self.counts['skip-no-urls'] += 1
            return None
        if len(urls) > SANE_MAX_URLS:
            self.counts['skip-too-many-urls'] += 1
            return None

        size = obj.get('size')
        if size:
            size = int(size)

        mimetype = obj.get('mimetype', self.default_mimetype)
        if not mimetype and urls:
            if urls[0].url.endswith('.pdf'):
                mimetype = 'application/pdf'

        fe = fatcat_openapi_client.FileEntity(
            md5=obj.get('md5'),
            sha1=obj['sha1'],
            sha256=obj.get('sha256'),
            size=size,
            mimetype=mimetype,
            release_ids=release_ids,
            urls=urls,
        )
        return fe
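
Collecting candidate URLs into a set of (rel, url) tuples, as Example #7 does, gives deduplication for free: make_rel_url() returns a hashable tuple, so the same page reached via multiple CDX rows collapses to one entry. A tiny illustration:

    urls = set()
    urls.add(("web", "https://example.com/paper.pdf"))
    urls.add(("web", "https://example.com/paper.pdf"))  # duplicate, dropped
    urls.add(("webarchive",
              "https://web.archive.org/web/20200101000000/https://example.com/paper.pdf"))
    assert len(urls) == 2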
Example #8
    def parse_record(self, row):

        request = row['request']
        fatcat = request.get('fatcat')
        file_meta = row['file_meta']

        # identify release by fatcat ident, or extid lookup, or biblio-glutton match
        release_ident = None
        if fatcat and fatcat.get('release_ident'):
            release_ident = fatcat.get('release_ident')
        elif request.get('ext_ids'):
            # if no fatcat ident, try extids
            for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv'):
                extid = request['ext_ids'].get(extid_type)
                if not extid:
                    continue
                try:
                    release = self.api.lookup_release(**{extid_type: extid})
                except fatcat_openapi_client.rest.ApiException as err:
                    if err.status == 404:
                        continue
                    elif err.status == 400:
                        self.counts['warn-extid-invalid'] += 1
                        continue
                    # any other API error is unexpected; re-raise it
                    raise err
                release_ident = release.ident
                break
        if not release_ident and row.get('grobid'):
            # try biblio-glutton extracted hit
            if row['grobid'].get('fatcat_release'):
                release_ident = row['grobid']['fatcat_release'].split('_')[-1]
                self.counts['glutton-match'] += 1

        if not release_ident:
            self.counts['skip-release-not-found'] += 1
            return None

        terminal = row.get('terminal')
        if not terminal:
            # support old cdx-only ingest results
            cdx = row.get('cdx')
            if not cdx:
                # TODO: support archive.org hits?
                self.counts['skip-no-terminal'] += 1
                return None
            else:
                terminal = {
                    'terminal_url': cdx['url'],
                    'terminal_dt': cdx['datetime'],
                    'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
                }

        # work around old schema
        if 'terminal_url' not in terminal:
            terminal['terminal_url'] = terminal['url']
        if 'terminal_dt' not in terminal:
            terminal['terminal_dt'] = terminal['dt']
        assert len(terminal['terminal_dt']) == 14

        default_rel = self.default_link_rel
        if request.get('link_source') == 'doi':
            default_rel = 'publisher'
        default_rel = request.get('rel', default_rel)
        url = make_rel_url(terminal['terminal_url'], default_rel)

        if not url:
            self.counts['skip-url'] += 1
            return None
        wayback = "https://web.archive.org/web/{}/{}".format(
            terminal['terminal_dt'], terminal['terminal_url'])
        urls = [url, ("webarchive", wayback)]

        urls = [
            fatcat_openapi_client.FileUrl(rel=rel, url=url)
            for (rel, url) in urls
        ]

        fe = fatcat_openapi_client.FileEntity(
            md5=file_meta['md5hex'],
            sha1=file_meta['sha1hex'],
            sha256=file_meta['sha256hex'],
            size=file_meta['size_bytes'],
            mimetype=file_meta['mimetype'],
            release_ids=[release_ident],
            urls=urls,
        )
        if request.get('edit_extra'):
            fe.edit_extra = request['edit_extra']
        else:
            fe.edit_extra = dict()
        if request.get('ingest_request_source'):
            fe.edit_extra['ingest_request_source'] = request[
                'ingest_request_source']
        if request.get('link_source') and request.get('link_source_id'):
            fe.edit_extra['link_source'] = request['link_source']
            fe.edit_extra['link_source_id'] = request['link_source_id']
        if not fe.edit_extra:
            fe.edit_extra = None
        return fe
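
The `len(terminal['terminal_dt']) == 14` assert in Example #8 reflects the wayback URL scheme: replay URLs embed a 14-digit YYYYMMDDhhmmss capture timestamp. A sketch with made-up values:

    terminal_dt = "20200101123000"  # hypothetical capture time, YYYYMMDDhhmmss
    terminal_url = "https://example.com/paper.pdf"  # hypothetical terminal URL
    assert len(terminal_dt) == 14
    wayback = "https://web.archive.org/web/{}/{}".format(terminal_dt, terminal_url)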
def test_access_redirect_fallback(client: Any, mocker: Any) -> None:

    with open("tests/files/elastic_fulltext_get.json") as f:
        elastic_resp = json.load(f)

    es_raw = mocker.patch(
        "elasticsearch.connection.Urllib3HttpConnection.perform_request"
    )
    es_raw.side_effect = [
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
        (200, {}, json.dumps(elastic_resp)),
    ]
    fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
    fatcat_get_work_raw.side_effect = [
        fatcat_openapi_client.WorkEntity(
            state="active",
            ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
        )
    ] * 4
    fatcat_get_work_releases_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_work_releases"
    )
    fatcat_get_work_releases_raw.side_effect = [
        [
            fatcat_openapi_client.ReleaseEntity(
                ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
                ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            ),
        ]
    ] * 4
    fatcat_get_release_raw = mocker.patch(
        "fatcat_openapi_client.DefaultApi.get_release"
    )
    fatcat_get_release_raw.side_effect = [
        fatcat_openapi_client.ReleaseEntity(
            state="active",
            ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            files=[
                fatcat_openapi_client.FileEntity(
                    ident="ffffffffffffffffffffffffff",
                    urls=[
                        fatcat_openapi_client.FileUrl(
                            rel="web",
                            url="https://blarg.example.com",
                        ),
                        fatcat_openapi_client.FileUrl(
                            rel="webarchive",
                            url="https://web.archive.org/web/12345/https://example.com",
                        ),
                        fatcat_openapi_client.FileUrl(
                            rel="archive",
                            url="https://archive.org/download/some/thing.pdf",
                        ),
                    ],
                ),
            ],
        )
    ] * 4

    # redirects should work after API lookup, for both wayback and archive.org
    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com",
        allow_redirects=False,
    )
    assert rv.status_code == 302
    assert (
        rv.headers["Location"]
        == "https://web.archive.org/web/12345id_/https://example.com"
    )

    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf",
        allow_redirects=False,
    )
    assert rv.status_code == 302
    assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf"

    # wrong URLs should still not work, but display a page with helpful links
    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
        allow_redirects=False,
    )
    assert rv.status_code == 404
    assert b"Access Location Not Found" in rv.content
    assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content

    rv = client.get(
        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf",
        allow_redirects=False,
    )
    assert rv.status_code == 404
    assert b"Access Location Not Found" in rv.content
    assert b"archive.org/download/some/thing.else.pdf" in rv.content