Beispiel #1
0
def run_elasticsearch_containers(args: argparse.Namespace) -> None:
    """
    Convert container entities to Elasticsearch documents, line by line.

    Each non-blank line of ``args.json_input`` is parsed as a
    ``ContainerEntity``. Entities whose ``state`` is not ``"active"`` are
    skipped. Every remaining entity is transformed with
    ``container_to_elasticsearch`` and written to ``args.json_output`` as a
    single JSON line.

    When ``args.query_stats`` is truthy, per-container stats are looked up
    with ``query_es_container_stats`` against the ``fatcat_release`` index
    and passed to the transform as ``stats``.
    """
    release_index = "fatcat_release"
    client = elasticsearch.Elasticsearch(args.fatcat_elasticsearch_url)

    for raw in args.json_input:
        raw = raw.strip()
        if not raw:
            continue

        container = entity_from_json(
            raw, ContainerEntity, api_client=args.api.api_client
        )
        if container.state != "active":
            continue

        # only pass `stats` when stats were requested, so the transform
        # otherwise runs with its own default
        extra_kwargs = {}
        if args.query_stats:
            extra_kwargs["stats"] = query_es_container_stats(
                container.ident,
                es_client=client,
                es_index=release_index,
                merge_shadows=True,
            )

        doc = container_to_elasticsearch(container, **extra_kwargs)
        args.json_output.write(json.dumps(doc) + "\n")
Beispiel #2
0
def test_elasticsearch_file_transform():
    """
    Check that a file entity fixture is transformed into the expected
    Elasticsearch document fields (hashes, size, mimetype, URL rels, hosts
    and domains).
    """
    with open("./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json", "r") as fixture:
        raw_json = fixture.read()

    file_entity = entity_from_json(raw_json, FileEntity)
    file_entity.state = "active"
    doc = file_to_elasticsearch(file_entity)

    # (document key, entity attribute) pairs that must match exactly
    mirrored_fields = (
        ("sha1", "sha1"),
        ("sha256", "sha256"),
        ("md5", "md5"),
        ("size_bytes", "size"),
        ("mimetype", "mimetype"),
    )
    for doc_key, attr_name in mirrored_fields:
        assert doc[doc_key] == getattr(file_entity, attr_name)
    assert doc["in_ia"] is True

    assert "web" in doc["rels"]
    assert "www.zhros.ru" in doc["hosts"]
    assert "zhros.ru" in doc["domains"]

    hosts_and_domains = doc["hosts"] + doc["domains"]
    assert "archive.org" in hosts_and_domains
    assert "web.archive.org" in hosts_and_domains
    # old regression
    assert ".archive.org" not in hosts_and_domains
Beispiel #3
0
def run_elasticsearch_changelogs(args: argparse.Namespace) -> None:
    """
    Convert changelog entries to Elasticsearch documents, line by line.

    Each non-blank line of ``args.json_input`` is parsed as a
    ``ChangelogEntry``, transformed with ``changelog_to_elasticsearch``, and
    written to ``args.json_output`` as a single JSON line.
    """
    for raw in args.json_input:
        raw = raw.strip()
        if not raw:
            continue
        changelog = entity_from_json(
            raw, ChangelogEntry, api_client=args.api.api_client
        )
        doc = changelog_to_elasticsearch(changelog)
        args.json_output.write(json.dumps(doc) + "\n")
Beispiel #4
0
def test_csl_pubmed(crossref_importer: Any) -> None:
    """
    Smoke test: every release in the PubMed fixture converts to CSL and
    renders in each citation style without raising.

    The ``crossref_importer`` argument is not used by the body itself.
    """
    with open("tests/files/example_releases_pubmed19n0972.json", "r") as fixture:
        # the fixture holds one release JSON document per line
        for raw in fixture:
            release = entity_from_json(raw, ReleaseEntity)
            csl = release_to_csl(release)
            for style in ("csl-json", "bibtex", "harvard1"):
                citeproc_csl(csl, style)
            citeproc_csl(csl, "harvard1", html=True)
Beispiel #5
0
def run_elasticsearch_files(args: argparse.Namespace) -> None:
    """
    Convert file entities to Elasticsearch documents, line by line.

    Each non-blank line of ``args.json_input`` is parsed as a ``FileEntity``.
    Entities whose ``state`` is not ``"active"`` are skipped. The rest are
    transformed with ``file_to_elasticsearch`` and written to
    ``args.json_output`` as a single JSON line each.
    """
    for raw in args.json_input:
        raw = raw.strip()
        if not raw:
            continue
        file_entity = entity_from_json(
            raw, FileEntity, api_client=args.api.api_client
        )
        if file_entity.state != "active":
            continue
        doc = file_to_elasticsearch(file_entity)
        args.json_output.write(json.dumps(doc) + "\n")
Beispiel #6
0
def run_citeproc_releases(args: argparse.Namespace) -> None:
    """
    Render release entities as citations, line by line.

    Each non-blank line of ``args.json_input`` is parsed as a
    ``ReleaseEntity``. Entities whose ``state`` is not ``"active"`` are
    skipped. The rest are converted with ``release_to_csl``, given an ``id``
    of ``"release:<ident>"`` (``"release:unknown"`` when the ident is
    falsy), rendered with ``citeproc_csl`` using ``args.style`` and
    ``args.html``, and written to ``args.json_output`` one per line.
    """
    for raw in args.json_input:
        raw = raw.strip()
        if not raw:
            continue
        release = entity_from_json(
            raw, ReleaseEntity, api_client=args.api.api_client
        )
        if release.state != "active":
            continue

        csl = release_to_csl(release)
        ident = release.ident or "unknown"
        csl["id"] = "release:" + ident

        rendered = citeproc_csl(csl, args.style, args.html)
        args.json_output.write(rendered + "\n")
Beispiel #7
0
def test_csl_pubmed_bibtex(crossref_importer: Any) -> None:
    """
    Check the exact BibTeX and HTML Harvard renderings of the first release
    in the PubMed fixture.

    The ``crossref_importer`` argument is not used by the body itself.
    """
    # TODO: what's with the '`' in volume?
    expected_bibtex = """
@article{mędrela-kuder_szymura_2018, 
  title={Selected anti-health behaviours among women with osteoporosis}, 
  volume={69`}, 
  ISSN={0035-7715}, 
  DOI={10.32394/rpzh.2018.0046}, 
  abstractNote={In the prevention of osteoporosis and its treatment, it is important to prevent bone loss by reducing the occurrence of factors determining human health, which reduce the risk of osteoporosis, such as health behaviors.}, 
  number={4}, 
  journal={Roczniki Panstwowego Zakladu Higieny}, 
  author={Mędrela-Kuder and Szymura}, 
  year={2018}
  }
    """.strip()
    expected_harvard_html = """
    Mędrela-Kuder and Szymura (2018) ‘Selected anti-health behaviours among women with osteoporosis’, <i>Roczniki Panstwowego Zakladu Higieny</i>, 69`(4). doi: 10.32394/rpzh.2018.0046.
    """.strip()

    # only the first line (first release) of the fixture is used
    with open("tests/files/example_releases_pubmed19n0972.json", "r") as fixture:
        release = entity_from_json(fixture.readline(), ReleaseEntity)
    csl = release_to_csl(release)

    # printed so the rendered output is visible when the test fails
    print(citeproc_csl(csl, "bibtex"))

    actual_bibtex = citeproc_csl(csl, "bibtex").strip()
    assert actual_bibtex == expected_bibtex

    actual_harvard_html = citeproc_csl(csl, "harvard1", html=True).strip()
    assert actual_harvard_html == expected_harvard_html
Beispiel #8
0
def test_elasticsearch_changelog_transform():
    """
    Check that a changelog entry fixture is transformed into the expected
    Elasticsearch document: index, timestamp, editor metadata, and the
    per-entity-type edit counts.
    """
    # read the fixture inside a context manager so the file handle is closed
    # deterministically instead of being left for garbage collection
    with open("./tests/files/changelog_3469683.json", "r") as f:
        ce = entity_from_json(f.read(), ChangelogEntry)

    es = changelog_to_elasticsearch(ce)
    assert es["index"] == 3469683
    # compare only down to whole seconds:
    # len("2020-01-30T05:04:39") => 19
    assert es["timestamp"][:19] == "2020-01-30T05:04:39.738601Z"[:19]
    assert es["editor_id"] == "scmbogxw25evtcesfcab5qaboa"
    assert es["username"] == "crawl-bot"
    assert es["is_bot"] is True
    assert es["is_admin"] is True
    assert es["agent"] == "fatcat_tools.IngestFileResultImporter"

    assert es["total"] == 50
    assert es["files"] == 50
    assert es["new_files"] == 50
    assert es["created"] == 50

    assert es["releases"] == 0
    assert es["new_releases"] == 0
    assert es["updated"] == 0
    assert es["deleted"] == 0
Beispiel #9
0
def test_elasticsearch_release_from_json():
    """
    Check the Elasticsearch transform of three release fixtures loaded from
    JSON: one with journal metadata, one with only a fileset, and one with
    only a web capture. Bibliographic fields, content counts, and
    preservation/access flags are verified for each.
    """

    def load_release(path):
        # read inside a context manager so the fixture's file handle is
        # always closed, rather than leaked by a bare open(...).read()
        with open(path, "r") as f:
            return entity_from_json(f.read(), ReleaseEntity)

    r = load_release("./tests/files/release_etodop5banbndg3faecnfm6ozi.json")
    es = release_to_elasticsearch(r)

    assert es["subtitle"] == "Correpondence"
    assert es["ident"] == "etodop5banbndg3faecnfm6ozi"
    assert (es["container_name"] ==
            "BJOG: an International Journal of Obstetrics and Gynaecology")
    assert es["first_page"] == "1404"
    assert es["issue"] == "11"
    assert es["volume"] == "118"
    assert es["number"] is None

    assert es["preservation"] == "dark"
    assert es["is_oa"] is False
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is False
    assert es["in_dweb"] is False
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is True
    assert es["in_kbart"] is True
    assert es["in_jstor"] is False

    # this release has a fileset, and no file
    r = load_release("./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json")
    es = release_to_elasticsearch(r)

    assert es["title"] == "Jakobshavn Glacier Bed Elevation"
    assert es["ident"] == "3mssw2qnlnblbk7oqyv2dafgey"
    assert es["file_count"] == 0
    assert es["fileset_count"] == 1
    assert es["webcapture_count"] == 0

    assert es["preservation"] == "dark"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is False
    assert es["in_jstor"] is False

    # this release has a web capture, and no file (edited the JSON to remove file)
    r = load_release("./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json")
    es = release_to_elasticsearch(r)

    assert es["title"] == "Rethinking Personal Digital Archiving, Part 1"
    assert es["ident"] == "mjtqtuyhwfdr7j2c3l36uor7uy"
    assert es["file_count"] == 0
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 1

    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is False
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is False
    assert es["in_jstor"] is False