def test_disambiguate_authors_on_first_last_name_and_initials(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check a new signature gets the author record of its full-name match.

    Two curated signatures ("'t Hooft, Gerard" and "'t Hooft, Gerard
    Antonio") are created and indexed first. A third record carrying only
    the full name "'t Hooft, Gerard Antonio" must end up in ES with the same
    ``record`` as the second signature.
    """
    first_data = faker.record("lit", with_control_number=True)
    first_data["authors"] = [
        {
            "full_name": "'t Hooft, Gerard",
            "curated_relation": True,
            "record": {"$ref": "http://localhost:5000/api/authors/999108"},
            "ids": [{"schema": "INSPIRE BAI", "value": "G.Hooft.2"}],
        }
    ]
    first_record = LiteratureRecord.create(first_data)

    second_data = faker.record("lit", with_control_number=True)
    second_data["authors"] = [
        {
            "full_name": "'t Hooft, Gerard Antonio",
            "curated_relation": True,
            "record": {"$ref": "http://localhost:5000/api/authors/999105"},
            "ids": [{"schema": "INSPIRE BAI", "value": "G.Hooft.1"}],
        }
    ]
    second_record = LiteratureRecord.create(second_data)
    db.session.commit()

    def curated_records_are_indexed():
        first_from_es = InspireSearch.get_record_data_from_es(first_record)
        second_from_es = InspireSearch.get_record_data_from_es(second_record)
        assert first_from_es and second_from_es

    retry_until_pass(curated_records_are_indexed, retry_interval=3)

    third_data = faker.record("lit", with_control_number=True)
    third_data["authors"] = [{"full_name": "'t Hooft, Gerard Antonio"}]
    third_record = LiteratureRecord.create(third_data)
    db.session.commit()

    def third_record_is_linked_to_second_author():
        third_from_es = InspireSearch.get_record_data_from_es(third_record)
        expected_author = second_data["authors"][0]["record"]
        linked_author = third_from_es["authors"][0]["record"]
        assert expected_author == linked_author

    retry_until_pass(third_record_is_linked_to_second_author, retry_interval=2)
def assert_disambiguation_on_update():
    """Assert the first author has a ``record.$ref`` in both the DB and ES.

    ``literature_record_uuid`` is not a parameter; it must be resolvable
    from an enclosing or module scope when this function runs.
    """
    # NOTE(review): the session is closed before re-reading, presumably so
    # the record is loaded fresh rather than from session state -- confirm.
    db.session.close()
    record_from_db = LiteratureRecord.get_record(literature_record_uuid)
    record_from_es = InspireSearch.get_record_data_from_es(record_from_db)
    assert record_from_db["authors"][0]["record"]["$ref"]
    assert record_from_es["authors"][0]["record"]["$ref"]
def test_disambiguation_handle_deleted_records(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check that updating a deleted literature record commits cleanly.

    A record flagged ``"deleted": True`` is created and committed, then its
    first author gets an affiliation and the record is updated. The second
    commit must not raise.
    """
    literature_data = faker.record("lit", with_control_number=True)
    literature_data.update(
        {
            "authors": [
                {
                    "full_name": "Kowalczyk, Elisabeth",
                    "ids": [{"schema": "INSPIRE BAI", "value": "E.Kowalczyk.1"}],
                }
            ],
            "deleted": True,
        }
    )
    literature_record = LiteratureRecord.create(data=literature_data)
    db.session.commit()

    literature_record["authors"][0]["affiliations"] = [{"value": "test"}]
    literature_record.update(dict(literature_record))

    # Let any exception propagate: the test still fails, but pytest reports
    # the real error and its traceback instead of a bare ``assert False``.
    db.session.commit()
def test_disambiguate_authors_create_new_author(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check that an author record is created for a new signature.

    After the literature record is indexed, the first hit of an empty
    authors search must be named "Michal Kowal" and its control number must
    equal the ``recid`` stored on the literature author in ES.
    """
    data = faker.record("lit", with_control_number=True)
    data["authors"] = [
        {"full_name": "Michal Kowal", "affiliations": [{"value": "Warsaw U."}]}
    ]
    record = LiteratureRecord.create(data=data)
    db.session.commit()

    def record_is_indexed():
        assert InspireSearch.get_record_data_from_es(record)

    retry_until_pass(record_is_indexed, retry_interval=3)

    def new_author_is_linked():
        literature_from_es = InspireSearch.get_record_data_from_es(record)
        first_author_hit = AuthorsSearch().query_from_iq("").execute().hits[0]
        assert first_author_hit.name["value"] == "Michal Kowal"
        linked_recid = literature_from_es["authors"][0]["recid"]
        assert linked_recid == first_author_hit.control_number

    retry_until_pass(new_author_is_linked)
def test_disambiguate_authors_create_two_author_with_same_name(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check that two same-named signatures yield two author records.

    A single literature record lists "Michal Kowal" twice; once it is
    indexed, an empty authors search is expected to return exactly two hits.
    """
    data = faker.record("lit", with_control_number=True)
    data["authors"] = [
        {"full_name": "Michal Kowal"},
        {"full_name": "Michal Kowal"},
    ]
    record = LiteratureRecord.create(data=data)
    db.session.commit()

    def record_is_indexed():
        assert InspireSearch.get_record_data_from_es(record)

    retry_until_pass(record_is_indexed, retry_interval=3)

    def two_authors_exist():
        author_hits = AuthorsSearch().query_from_iq("").execute().hits
        assert len(author_hits) == 2

    retry_until_pass(two_authors_exist)
def get_resolved_references_by_control_number(self, data):
    """Return the resolved references for ``data`` keyed by control number.

    ``data`` is normalised with ``force_list`` and handed to
    ``LiteratureRecord.get_es_linked_references``; each returned record is
    stored as ``record.dumps()`` under its ``control_number``.
    """
    references = force_list(data)
    # Imported at call time, as in the original.
    # NOTE(review): presumably done to avoid a circular import -- confirm.
    from inspirehep.records.api.literature import LiteratureRecord

    by_control_number = {}
    for linked_record in LiteratureRecord.get_es_linked_references(references):
        control_number = linked_record["control_number"]
        by_control_number[control_number] = linked_record.dumps()
    return by_control_number
def link_signature_to_author(signature_data, author_control_number):
    """Add the ``record`` reference of the given author to a signature.

    Args:
        signature_data (dict): Identifies the signature through its
            ``publication_id`` and ``signature_uuid`` keys.
        author_control_number (int): The control number of the author to
            which we want to link.

    Returns:
        dict | None: The signature from the publication with the linked
        author, or ``None`` when nothing was changed: no author of the
        publication has the given uuid, the signature already has a
        ``record`` with a truthy ``curated_relation``, or it already points
        at the requested author.
    """
    record = LiteratureRecord.get_record_by_pid_value(
        signature_data["publication_id"]
    )
    signature_uuid = signature_data["signature_uuid"]
    # Default to an empty list so a publication without authors is treated
    # as "signature not found" instead of raising TypeError.
    signature = next(
        (
            author
            for author in record.get("authors", [])
            if author.get("uuid") == signature_uuid
        ),
        None,
    )
    if not signature:
        return None

    has_record = "record" in signature
    is_curated = signature.get("curated_relation")
    if has_record and is_curated:
        # Curated links are left untouched.
        return None
    if is_curated and not has_record:
        # A curated flag with no linked record is reset before linking.
        signature["curated_relation"] = False

    new_author_record = get_record_ref(author_control_number, "authors")
    if new_author_record == signature.get("record"):
        # no changes, avoid creating a new useless version of the record
        return None

    signature["record"] = new_author_record
    record.update(dict(record))
    return signature
def test_signature_linked_by_disambiguation_has_correct_facet_author_name(
    inspire_app, celery_app_with_context, celery_session_worker
):
    """Check ES facet name and author ref after ``disambiguate_signatures``.

    A single-signature cluster with no authors is disambiguated. Exactly one
    ``aut`` PID must exist afterwards, and the indexed literature record
    must carry the matching ``facet_author_name`` and author ``$ref``.
    """
    signature_uuid = "94fc2b0a-dc17-42c2-bae3-ca0024079e51"

    data = faker.record("lit")
    data["authors"] = [{"full_name": "Doe, John", "uuid": signature_uuid}]
    record = LiteratureRecord.create(data)
    db.session.commit()

    signature = {
        "publication_id": record["control_number"],
        "signature_uuid": signature_uuid,
    }
    disambiguate_signatures([{"signatures": [signature], "authors": []}])

    author_pids = PersistentIdentifier.query.filter_by(pid_type="aut").all()
    assert len(author_pids) == 1
    pid_value = author_pids[0].pid_value
    author = AuthorsRecord.get_record_by_pid_value(pid_value)
    author_control_number = author.pop("control_number")

    refresh_index = {
        "step": current_search.flush_and_refresh,
        "args": ["records-hep"],
    }
    search_index = {
        "step": es_search,
        "args": ["records-hep"],
        "expected_result": {
            "expected_key": "hits.total.value",
            "expected_result": 1,
        },
    }
    facet_name_matches = {
        "expected_key": "hits.hits[0]._source.facet_author_name",
        "expected_result": [f"{author_control_number}_John Doe"],
    }
    author_ref_matches = {
        "expected_key": "hits.hits[0]._source.authors[0].record.$ref",
        "expected_result": f"http://localhost:5000/api/authors/{pid_value}",
    }
    retry_until_matched(
        [refresh_index, search_index, facet_name_matches, author_ref_matches]
    )
def test_disambiguation_doesnt_assign_bai_when_already_in_author(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check the literature author in ES carries the author record's BAI.

    An author record with BAI "J.M.Maldacena.1" is indexed, then a
    literature record is created whose author has the same name and a
    different BAI ("A.Test.1"). The ES copy of the literature author must
    contain the "J.M.Maldacena.1" BAI.
    """
    author_data = faker.record("aut", with_control_number=True)
    author_data.update(
        name={"value": "Brian Gross"},
        ids=[{"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"}],
        email_addresses=[{"current": True, "value": "*****@*****.**"}],
    )
    author_record = InspireRecord.create(author_data)
    db.session.commit()

    def author_is_indexed():
        assert InspireSearch.get_record_data_from_es(author_record)

    retry_until_pass(author_is_indexed)

    literature_data = faker.record("lit", with_control_number=True)
    literature_data["authors"] = [
        {
            "full_name": "Brian Gross",
            "ids": [{"schema": "INSPIRE BAI", "value": "A.Test.1"}],
            "emails": ["*****@*****.**"],
        }
    ]
    literature_record = LiteratureRecord.create(literature_data)
    db.session.commit()

    def author_bai_is_present():
        literature_from_es = InspireSearch.get_record_data_from_es(
            literature_record
        )
        expected_bai = {"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"}
        assert expected_bai in literature_from_es["authors"][0]["ids"]

    retry_until_pass(author_bai_is_present, retry_interval=2)
def test_disambiguation_on_record_update_unambiguous_match(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check the author's BAI is unchanged after the record is updated.

    The record is created with a BAI on its only author. Once that BAI is
    visible in ES, an email is added to the author and the record is
    updated; the first BAI in ES must still equal the original one.
    """
    literature_data = faker.record("lit", with_control_number=True)
    literature_data["authors"] = [
        {
            "full_name": "Kowalczyk, Elisabeth",
            "ids": [{"schema": "INSPIRE BAI", "value": "E.Kowalczyk.1"}],
        }
    ]
    literature_record = LiteratureRecord.create(data=literature_data)
    db.session.commit()

    def bais_in_es():
        """Return the INSPIRE BAI values of the first author as seen in ES."""
        record_from_es = InspireSearch.get_record_data_from_es(literature_record)
        return get_values_for_schema(
            record_from_es["authors"][0]["ids"], "INSPIRE BAI"
        )

    def bai_is_indexed():
        assert bais_in_es()

    retry_until_pass(bai_is_indexed, retry_interval=2)

    original_ids = literature_record["authors"][0]["ids"]
    old_bai = get_values_for_schema(original_ids, "INSPIRE BAI")[0]

    db.session.expire_all()
    reloaded_record = InspireRecord.get_record(literature_record.id)
    reloaded_record["authors"][0]["emails"] = ["test.test@com"]
    reloaded_record.update(dict(reloaded_record))
    db.session.commit()

    def bai_is_unchanged_after_update():
        assert bais_in_es()[0] == old_bai

    retry_until_pass(bai_is_unchanged_after_update, retry_interval=2)
def test_signature_linked_by_disambiguation_has_correct_facet_author_name(
    inspire_app, clean_celery_session
):
    """Check ES facet name and author ref after ``disambiguate_signatures``.

    A single-signature cluster with no authors is disambiguated. Exactly one
    ``aut`` PID must exist afterwards, and the literature record read back
    from ES must carry the matching ``facet_author_name`` and author
    ``$ref``.
    """
    signature_uuid = "94fc2b0a-dc17-42c2-bae3-ca0024079e51"

    data = faker.record("lit")
    data["authors"] = [{"full_name": "Doe, John", "uuid": signature_uuid}]
    record = LiteratureRecord.create(data)
    db.session.commit()

    signature = {
        "publication_id": record["control_number"],
        "signature_uuid": signature_uuid,
    }
    disambiguate_signatures([{"signatures": [signature], "authors": []}])

    author_pids = PersistentIdentifier.query.filter_by(pid_type="aut").all()
    assert len(author_pids) == 1
    pid_value = author_pids[0].pid_value
    author = AuthorsRecord.get_record_by_pid_value(pid_value)
    author_control_number = author.pop("control_number")

    expected_facet_author_name = [f"{author_control_number}_John Doe"]
    expected_record_ref = f"http://localhost:5000/api/authors/{pid_value}"

    def indexed_record_matches():
        current_search.flush_and_refresh("records-hep")
        record_from_es = InspireSearch.get_record_data_from_es(record)
        indexed_facet_name = record_from_es["facet_author_name"]
        assert expected_facet_author_name == indexed_facet_name
        indexed_ref = record_from_es["authors"][0]["record"]["$ref"]
        assert expected_record_ref == indexed_ref

    retry_until_pass(indexed_record_matches, retry_interval=2)
def test_disambiguation_on_record_update_ambiguous_match(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check ids/record stay stable when an ambiguous signature is updated.

    Two curated "Kowal, Michal" signatures pointing at different authors are
    indexed. A third, bare signature must receive a BAI whose ``ids`` differ
    from both. After adding an affiliation and updating, ES must show the
    same ``ids`` and ``record`` as the reloaded record.
    """
    first_data = faker.record("lit", with_control_number=True)
    first_data["authors"] = [
        {
            "full_name": "Kowal, Michal",
            "ids": [{"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"}],
            "affiliations": [{"value": "Warsaw U."}],
            "record": {"$ref": "http://localhost:5000/api/authors/999101"},
            "curated_relation": True,
        }
    ]
    first_record = LiteratureRecord.create(data=first_data)

    second_data = faker.record("lit", with_control_number=True)
    second_data["authors"] = [
        {
            "full_name": "Kowal, Michal",
            "ids": [{"schema": "INSPIRE BAI", "value": "J.M.Maldacena.2"}],
            "record": {"$ref": "http://localhost:5000/api/authors/999102"},
            "curated_relation": True,
        }
    ]
    second_record = LiteratureRecord.create(data=second_data)
    db.session.commit()

    def curated_records_are_indexed():
        first_from_es = InspireSearch.get_record_data_from_es(first_record)
        second_from_es = InspireSearch.get_record_data_from_es(second_record)
        assert first_from_es
        assert second_from_es

    retry_until_pass(curated_records_are_indexed)

    third_data = faker.record("lit", with_control_number=True)
    third_data["authors"] = [{"full_name": "Kowal, Michal"}]
    third_record = LiteratureRecord.create(data=third_data)
    db.session.commit()

    def third_record_gets_its_own_bai():
        third_from_es = InspireSearch.get_record_data_from_es(third_record)
        assert get_values_for_schema(
            third_from_es["authors"][0]["ids"], "INSPIRE BAI"
        )
        ids_in_es = third_from_es["authors"][0]["ids"]
        assert ids_in_es != first_record["authors"][0]["ids"]
        assert ids_in_es != second_record["authors"][0]["ids"]

    retry_until_pass(third_record_gets_its_own_bai, retry_interval=2)

    db.session.expire_all()
    reloaded_record = InspireRecord.get_record(third_record.id)
    reloaded_record["authors"][0]["affiliations"] = [{"value": "CERN"}]
    reloaded_record.update(dict(reloaded_record))
    db.session.commit()

    def es_matches_reloaded_record():
        third_from_es = InspireSearch.get_record_data_from_es(third_record)
        author_in_es = third_from_es["authors"][0]
        assert author_in_es["ids"] == reloaded_record["authors"][0]["ids"]
        assert author_in_es["record"] == reloaded_record["authors"][0]["record"]

    retry_until_pass(es_matches_reloaded_record, retry_interval=2)
def build_seminar(self, data) -> dict:
    """Build a seminar record from submitted form data.

    Args:
        data (dict): The submission. Keys read here: ``name``,
            ``field_of_interest``, ``additional_info``, ``series_name``,
            ``series_number``, ``timezone``, ``dates`` (start at index 0,
            end at index 1), ``address``, ``abstract``, ``captioned``,
            ``contacts``, ``speakers``, ``material_urls``, ``join_urls``,
            ``websites``, ``keywords`` and ``literature_records``.

    Returns:
        dict: ``builder.record`` with ``$schema`` set to the seminars schema
        URL.

    Raises:
        InvalidDataError: If an entry of ``literature_records`` does not
            resolve to an existing literature record.
    """
    builder = SeminarBuilder()
    builder.set_title(title=data.get("name"))
    builder.add_inspire_categories(data.get("field_of_interest", []))
    builder.add_public_note(value=data.get("additional_info", ""))
    builder.add_series(
        name=data.get("series_name"), number=data.get("series_number")
    )

    # Both datetimes go through ``local_form_datetime_to_iso_utc`` together
    # with the submitted timezone before being stored.
    timezone = data.get("timezone")
    builder.set_timezone(timezone)

    start_datetime = get_value(data, "dates[0]")
    start_datetime_utc = local_form_datetime_to_iso_utc(start_datetime, timezone)
    builder.set_start_datetime(start_datetime_utc)

    end_datetime = get_value(data, "dates[1]")
    end_datetime_utc = local_form_datetime_to_iso_utc(end_datetime, timezone)
    builder.set_end_datetime(end_datetime_utc)

    address = data.get("address")
    if address:
        builder.set_address(
            cities=[address.get("city")],
            state=address.get("state"),
            place_name=address.get("venue"),
            country_code=country_name_to_code(address.get("country")),
        )

    abstract = data.get("abstract")
    if abstract:
        builder.set_abstract(value=abstract)

    captioned = data.get("captioned")
    if captioned:
        builder.set_captioned(captioned)

    for contact in data.get("contacts", []):
        builder.add_contact(**contact)

    for speaker in data.get("speakers", []):
        name = speaker.get("name")
        record = speaker.get("record")

        affiliation_value = speaker.get("affiliation")
        affiliation_record = speaker.get("affiliation_record")
        affiliation = {}
        if affiliation_value:
            affiliation["value"] = affiliation_value
        if affiliation_record:
            affiliation["record"] = affiliation_record

        # Pass ``None`` rather than a list holding an empty dict when the
        # speaker has neither an affiliation value nor an affiliation record.
        affiliations = [affiliation] if affiliation else None
        builder.add_speaker(name=name, record=record, affiliations=affiliations)

    for url in data.get("material_urls", []):
        builder.add_material_url(**url)

    for url in data.get("join_urls", []):
        builder.add_join_url(**url)

    for website in data.get("websites", []):
        builder.add_url(website)

    for keyword in data.get("keywords", []):
        builder.add_keyword(value=keyword)

    for literature_record_pid in data.get("literature_records", []):
        # The lookup is only an existence check; its result is not used.
        try:
            LiteratureRecord.get_record_by_pid_value(literature_record_pid)
        except PIDDoesNotExistError as error:
            # Chain explicitly so the PID lookup failure stays attached as
            # the cause of the validation error.
            raise InvalidDataError(
                f"{literature_record_pid} is not a valid literature record."
            ) from error
        record = {
            "$ref": f"{get_inspirehep_url()}/api/literature/{literature_record_pid}"
        }
        builder.add_literature_record(record=record)

    builder.record["$schema"] = url_for(
        "invenio_jsonschemas.get_schema",
        schema_path="records/seminars.json",
        _external=True,
    )
    return builder.record
def test_disambiguation_races_assign(
    override_config, inspire_app, clean_celery_session, enable_disambiguation
):
    """Check the signature's BAI changes after posting to the assign endpoint.

    A curated signature linked to an author with BAI "M.F.A.Hearn.1" is
    created, then a cataloger posts to ``/api/assign/author`` with only
    ``from_author_recid``. The first BAI of the author in ES must no longer
    be "M.F.A.Hearn.1".
    """
    cataloger = create_user(role="cataloger")
    feature_flags = override_config(
        FEATURE_FLAG_ENABLE_BAI_PROVIDER=True,
        FEATURE_FLAG_ENABLE_BAI_CREATION=True,
    )
    with feature_flags:
        author_record_data = faker.record("aut")
        author_record_data.update(
            name={"value": "Michael F. A'Hearn"},
            ids=[{"schema": "INSPIRE BAI", "value": "M.F.A.Hearn.1"}],
        )
        author_record = AuthorsRecord.create(author_record_data)

        lit_data = faker.record("lit")
        lit_data["authors"] = [
            {
                "ids": [{"value": "M.F.A.Hearn.1", "schema": "INSPIRE BAI"}],
                "uuid": "ce061c1e-866a-422d-9982-652183bae814",
                "full_name": "A'Hearn, M.F.",
                "signature_block": "HARNm",
                "curated_relation": True,
                "record": author_record["self"],
            }
        ]
        lit_record = LiteratureRecord.create(lit_data)
        db.session.commit()

        with inspire_app.test_client() as client:
            login_user_via_session(client, email=cataloger.email)
            payload = orjson.dumps(
                {
                    "literature_recids": [lit_record["control_number"]],
                    "from_author_recid": author_record["control_number"],
                }
            )
            client.post(
                "/api/assign/author",
                data=payload,
                content_type="application/json",
            )

        def bai_was_replaced():
            lit_from_es = InspireSearch.get_record_data_from_es(lit_record)
            bais = get_values_for_schema(
                lit_from_es["authors"][0]["ids"], "INSPIRE BAI"
            )
            assert bais[0] != "M.F.A.Hearn.1"

        retry_until_pass(bai_was_replaced, retry_interval=2)
def test_disambiguate_many_authors_runs_after_record_creation(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check both signatures of a new record are linked to their authors.

    Two author records are indexed, then a literature record with two
    matching signatures is created. In ES, each signature's ``record.$ref``
    must contain the control number of the corresponding author.
    """
    author_1 = faker.record("aut", with_control_number=True)
    author_1.update(
        name={"value": "Brian Gross"},
        ids=[
            {"schema": "INSPIRE ID", "value": "INSPIRE-00304313"},
            {"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"},
        ],
        email_addresses=[{"current": True, "value": "*****@*****.**"}],
    )
    author_2 = faker.record("aut", with_control_number=True)
    author_2.update(
        name={"value": "Donald Matthews"},
        ids=[{"schema": "INSPIRE BAI", "value": "H.Khalfoun.1"}],
        email_addresses=[
            {"current": True, "value": "*****@*****.**"},
            {"current": True, "value": "*****@*****.**"},
        ],
    )
    author_record_1 = InspireRecord.create(author_1)
    author_record_2 = InspireRecord.create(author_2)
    db.session.commit()

    def authors_are_indexed():
        first_from_es = InspireSearch.get_record_data_from_es(author_record_1)
        second_from_es = InspireSearch.get_record_data_from_es(author_record_2)
        assert first_from_es and second_from_es

    retry_until_pass(authors_are_indexed)

    literature_data = faker.record("lit", with_control_number=True)
    literature_data["authors"] = [
        {
            "full_name": "Brian Gross",
            "ids": [
                {"schema": "INSPIRE ID", "value": "INSPIRE-00304313"},
                {"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"},
            ],
            "emails": ["*****@*****.**"],
        },
        {
            "full_name": "Donald Matthews",
            "ids": [{"schema": "INSPIRE BAI", "value": "H.Khalfoun.1"}],
            "emails": ["*****@*****.**", "*****@*****.**"],
        },
    ]
    literature_record = LiteratureRecord.create(literature_data)
    db.session.commit()

    def signatures_are_linked():
        literature_from_es = InspireSearch.get_record_data_from_es(
            literature_record
        )
        authors_in_es = literature_from_es.get("authors")
        for position, author in enumerate((author_1, author_2)):
            control_number = str(author["control_number"])
            assert control_number in authors_in_es[position]["record"]["$ref"]

    retry_until_pass(signatures_are_linked, retry_interval=2)
def test_disambiguation_runs_after_lit_record_update(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check an author appended on update also gets a ``record.$ref``.

    Three author records are indexed. A literature record with one
    signature is created and must get a ``record.$ref`` in both DB and ES.
    A second signature is then appended; after the update both signatures
    must have a ``record.$ref`` in DB and ES.
    """
    first_author_data = faker.record("aut")
    first_author_data.update(
        control_number=1,
        name={"value": "Brian Gross"},
        ids=[{"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"}],
        email_addresses=[{"current": True, "value": "*****@*****.**"}],
    )
    first_author = InspireRecord.create(first_author_data)

    second_author_data = faker.record("aut")
    second_author_data.update(
        control_number=2,
        name={"value": "Test Author"},
        email_addresses=[{"current": True, "value": "*****@*****.**"}],
    )
    second_author = InspireRecord.create(second_author_data)

    third_author_data = faker.record("aut")
    third_author_data.update(
        control_number=3,
        name={"value": "Another Author"},
        email_addresses=[
            {"current": True, "value": "*****@*****.**"},
            {"current": True, "hidden": True, "value": "*****@*****.**"},
        ],
    )
    third_author = InspireRecord.create(third_author_data)
    db.session.commit()

    def authors_are_indexed():
        first_from_es = InspireSearch.get_record_data_from_es(first_author)
        second_from_es = InspireSearch.get_record_data_from_es(second_author)
        third_from_es = InspireSearch.get_record_data_from_es(third_author)
        assert first_from_es
        assert second_from_es
        assert third_from_es

    retry_until_pass(authors_are_indexed, retry_interval=5)

    literature_data = faker.record("lit")
    literature_data.update(
        control_number=4,
        authors=[
            {
                "full_name": "Brian Gross",
                "ids": [{"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"}],
                "emails": ["*****@*****.**"],
                "uuid": "798d9afe-d3c2-479e-b384-f0aee2573076",
            }
        ],
    )
    literature_record_uuid = LiteratureRecord.create(literature_data).id
    db.session.commit()

    def first_author_is_linked():
        db.session.close()
        from_db = LiteratureRecord.get_record(literature_record_uuid)
        from_es = InspireSearch.get_record_data_from_es(from_db)
        assert from_db["authors"][0]["record"]["$ref"]
        assert from_es["authors"][0]["record"]["$ref"]

    retry_until_pass(first_author_is_linked, retry_interval=5)

    record_to_update = LiteratureRecord.get_record(literature_record_uuid)
    record_to_update["authors"].append(
        {"full_name": "Test Author", "emails": ["*****@*****.**"]}
    )
    record_to_update.update(dict(record_to_update))
    db.session.commit()

    def both_authors_are_linked():
        db.session.close()
        from_db = LiteratureRecord.get_record(literature_record_uuid)
        from_es = InspireSearch.get_record_data_from_es(from_db)
        for source in (from_db, from_es):
            for position in (0, 1):
                assert source["authors"][position]["record"]["$ref"]

    retry_until_pass(both_authors_are_linked, retry_interval=5)
def test_disambiguate_authors_doesnt_match_when_author_is_ambiguous(
    inspire_app, clean_celery_session, enable_disambiguation
):
    """Check a signature matching two authors is linked to neither of them.

    Two author records named "Brian Gross" (control numbers 90676330 and
    90676331) are indexed. A literature record with a "Brian Gross"
    signature is then created; its ES copy must not reference either of the
    two existing authors.
    """
    author_1 = faker.record("aut", with_control_number=True)
    author_1.update(
        {
            "name": {"value": "Brian Gross"},
            "ids": [
                {"schema": "INSPIRE ID", "value": "INSPIRE-00304313"},
                {"schema": "INSPIRE BAI", "value": "J.M.Maldacena.2"},
            ],
            "email_addresses": [{"current": True, "value": "*****@*****.**"}],
            "control_number": 90_676_330,
        }
    )
    author_2 = faker.record("aut", with_control_number=True)
    author_2.update(
        {
            "name": {"value": "Brian Gross"},
            "ids": [
                {"schema": "INSPIRE ID", "value": "INSPIRE-00300003"},
                {"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"},
            ],
            "email_addresses": [{"current": True, "value": "*****@*****.**"}],
            "control_number": 90_676_331,
        }
    )
    author_record_1 = InspireRecord.create(author_1)
    author_record_2 = InspireRecord.create(author_2)
    db.session.commit()

    def assert_authors_records_exist_in_es():
        author_record_from_es = InspireSearch.get_record_data_from_es(
            author_record_1
        )
        author_2_from_es = InspireSearch.get_record_data_from_es(author_record_2)
        assert author_record_from_es
        assert author_2_from_es

    retry_until_pass(assert_authors_records_exist_in_es, retry_interval=2)

    authors = [{"full_name": "Brian Gross", "emails": ["*****@*****.**"]}]
    literature_data = faker.record("lit", with_control_number=True)
    literature_data.update({"authors": authors})
    literature_record = LiteratureRecord.create(literature_data)
    db.session.commit()

    def assert_disambiguation_task():
        literature_record_from_es = InspireSearch.get_record_data_from_es(
            literature_record
        )
        # ``record`` is a ``{"$ref": url}`` object, so compare the URL it
        # holds: comparing the whole object with a string could never fail.
        linked_record = literature_record_from_es["authors"][0].get("record")
        linked_ref = (linked_record or {}).get("$ref")
        # new author is created
        assert linked_ref != "http://localhost:5000/api/authors/90676330"
        assert linked_ref != "http://localhost:5000/api/authors/90676331"
        # NOTE(review): these checks also pass while the signature has no
        # ``record`` yet; consider asserting that ``linked_ref`` is set if a
        # new author is always expected to be created here.

    retry_until_pass(assert_disambiguation_task, retry_interval=5)