def assert_disambiguation_cli():
    """Check the "record" key on authors of the records fetched from ES.

    Every author of the records looked up by ``record_uuids`` must carry a
    "record" key, while the first author of the record that should not be
    disambiguated must not.
    """
    disambiguated = LiteratureSearch().get_records(record_uuids).execute()
    all_authors = (
        author for es_record in disambiguated for author in es_record.authors
    )
    for author in all_authors:
        assert "record" in author

    untouched_uuids = [record_that_shouldnt_be_disambiguated_uuid]
    record_not_disambiguated = (
        LiteratureSearch().get_records(untouched_uuids).execute()
    )
    first_author = record_not_disambiguated[0]["authors"][0]
    assert "record" not in first_author
def test_gracefully_handle_records_updating_in_wrong_order(
    inspire_app, clean_celery_session
):
    """Index versions of a record out of order and check the ES content.

    The record is updated twice (references removed, then title changed)
    while automatic indexing is disconnected; the latest version is indexed
    first and the previous version afterwards.
    """
    # We want to run indexing in weird order, so disable auto indexing
    models_committed.disconnect(index_after_commit)
    try:
        cited_record = LiteratureRecord.create(data=faker.record("lit"))
        record_data = faker.record(
            "lit", literature_citations=[cited_record.control_number]
        )
        record = LiteratureRecord.create(data=record_data)
        db.session.commit()

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        index_record(record.id, record.model.versions[-1].version_id)
        assert LiteratureSearch().get_source(cited_record.id)["citation_count"] == 1

        # First update: drop the references.
        data = dict(record)
        del data["references"]
        record.update(data)
        db.session.commit()

        # Second update: change the title.
        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        data = dict(record)
        data["titles"][0] = {"title": "New Title"}
        record.update(data)
        db.session.commit()

        # Index the latest version first...
        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        index_record(record.id, record.model.versions[-1].version_id)

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        assert LiteratureSearch().get_source(cited_record.id)["citation_count"] == 1
        assert LiteratureSearch().get_source(record.id)["titles"] == [
            {"title": "New Title"}
        ]

        # ...and only then the version before it.
        index_record(record.id, record.model.versions[-2].version_id)

        assert LiteratureSearch().get_source(cited_record.id)["citation_count"] == 0
        assert LiteratureSearch().get_source(record.id)["titles"] == [
            {"title": "New Title"}
        ]
    finally:
        # Reconnect even when an assertion above fails, so the disconnected
        # signal does not leak into the tests that run afterwards.
        models_committed.connect(index_after_commit)
def test_get_search_with_source_with_LiteratureSearch_instance_with_defined_headers(
    inspire_app,
):
    """The ``_source`` includes/excludes configured for the Accept type are used."""
    ui_mimetype = "application/vnd+inspire.record.ui+json"
    includes_by_content_type = {ui_mimetype: ["title", "description"]}
    excludes_by_content_type = {
        ui_mimetype: ["excludes_with_includes_looks_stupid"],
        "application/bibtex": ["control_number"],
    }
    config = {
        "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE": includes_by_content_type,
        "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE": excludes_by_content_type,
    }
    request_context = current_app.test_request_context(
        headers={"Accept": ui_mimetype}
    )

    with override_config(**config), request_context:
        search = get_search_with_source(LiteratureSearch())

        expected_source_includes = ["title", "description"]
        expected_source_excludes = ["excludes_with_includes_looks_stupid"]

        search_source = search.to_dict()["_source"]

        assert expected_source_includes == search_source["includes"]
        assert expected_source_excludes == search_source["excludes"]
def assert_record_in_es():
    """Check the first document of ``rec`` as stored in ES.

    The document must contain "attachment" and must not contain "text".
    """
    current_search.flush_and_refresh("*")
    response = LiteratureSearch().get_record(str(rec.id)).execute()
    record_lit_es = response.hits.hits[0]
    first_document = record_lit_es._source["documents"][0]
    assert "attachment" in first_document
    # pipeline should remove it
    assert "text" not in first_document
def _get_all_not_disambiguated_records_search():
    """Build the search used to find records that are not disambiguated.

    The query requires both a nested ``authors`` clause whose ``must_not``
    is ``exists`` on ``authors.record.$ref`` and a match of ``_collections``
    against "Literature".

    Returns:
        The search object built from that query with ``track_total_hits``,
        an empty ``_source``, ``size=1000`` and ``scroll="60m"`` params set.
    """
    author_without_record_ref = {
        "nested": {
            "path": "authors",
            "query": {
                "bool": {
                    "must_not": {"exists": {"field": "authors.record.$ref"}}
                }
            },
        }
    }
    in_literature_collection = {"match": {"_collections": "Literature"}}
    query = {
        "query": {
            "bool": {"must": [author_without_record_ref, in_literature_collection]}
        }
    }

    search_obj = LiteratureSearch().from_dict(query)
    search_obj = search_obj.params(
        track_total_hits=True, _source={}, size=1000, scroll="60m"
    )
    return search_obj
def test_literature_journal_title_search_is_case_insensitive(inspire_app):
    """Both "j jhep" and "j JHEP" return the two records published in JHEP."""
    publication_with_journal_record = {
        "year": 2017,
        "artid": "020",
        "page_start": "020",
        "journal_title": "JHEP",
        "journal_record": {"$ref": "https://inspirebeta.net/api/journals/1213103"},
        "journal_volume": "10",
    }
    publication_without_journal_record = {
        "year": 2017,
        "artid": "021",
        "page_start": "021",
        "journal_title": "JHEP",
        "journal_volume": "10",
    }
    record1 = create_record(
        "lit", data={"publication_info": [publication_with_journal_record]}
    )
    record2 = create_record(
        "lit", data={"publication_info": [publication_without_journal_record]}
    )

    result_lowercase = LiteratureSearch().query_from_iq("j jhep").execute()
    result_uppercase = LiteratureSearch().query_from_iq("j JHEP").execute()

    expected_ids = (str(record1.id), str(record2.id))
    for result in (result_lowercase, result_uppercase):
        assert result
        found_record_ids = [hit._id for hit in result["hits"]["hits"]]
        assert len(found_record_ids) == 2
        for expected_id in expected_ids:
            assert expected_id in found_record_ids
def assert_all_records_in_es():
    """Check that 11 literature records reference the journal and it is in ES.

    The literature records are the ones whose
    ``publication_info.journal_record.$ref`` matches
    ``journal_record_reference``.
    """
    journal_ref_query = (
        f"publication_info.journal_record.$ref: {journal_record_reference}"
    )
    scan_results = LiteratureSearch().query_from_iq(
        query_string=journal_ref_query
    ).scan()
    literature_records_from_es = list(scan_results)
    journal_record_from_es = InspireSearch.get_record_data_from_es(journal)

    assert len(literature_records_from_es) == 11
    assert journal_record_from_es
def assert_update_in_es():
    """Check the first document of ``record`` in ES after the update.

    Its key must be "new_doc.pdf" and its attachment must differ from
    ``record_first_attachment``.
    """
    current_search.flush_and_refresh("*")
    response = LiteratureSearch().get_record(str(record.id)).execute()
    record_lit_es = response.hits.hits[0]

    assert "new_doc.pdf" == record_lit_es._source["documents"][0]["key"]

    indexed_attachment = record_lit_es._source["documents"][0]["attachment"]
    assert record_first_attachment != indexed_attachment
def test_get_search_with_source_with_fields_query_param_and_wrong_formats(
    inspire_app,
):
    """``fields`` combined with bibtex / latex-eu / latex-us formats is rejected."""
    for forbidden_format in ("bibtex", "latex-eu", "latex-us"):
        query_string = f"?fields=authors,ids&format={forbidden_format}"
        with current_app.test_request_context(query_string):
            search = LiteratureSearch()
            with pytest.raises(FieldsParamForbidden):
                get_search_with_source(search)
def test_get_search_with_source_with_fields_query_param(inspire_app):
    """``?fields=`` values plus three extra fields end up in ``_source`` includes."""
    expected_search_to_dict_source = {
        "includes": [
            "authors",
            "ids",
            "control_number",
            "_updated",
            "_created",
        ]
    }

    with current_app.test_request_context("?fields=authors,ids"):
        search = get_search_with_source(LiteratureSearch())

    actual_source = search.to_dict()["_source"]
    assert expected_search_to_dict_source == actual_source
def test_literature_get_records_by_pids_returns_correct_record(inspire_app):
    """``get_records_by_pids`` returns exactly the records for the given pids."""
    record1 = create_record("lit")
    record1_control_number = record1["control_number"]
    record2 = create_record("lit")
    record2_control_number = record2["control_number"]

    # Single pid: one result, matching the first record.
    single_pid = [("lit", record1_control_number)]
    result = LiteratureSearch().get_records_by_pids(single_pid)
    assert len(result) == 1
    ui_display = json.loads(result[0]._ui_display)
    assert ui_display["control_number"] == record1["control_number"]

    # Two pids: both records come back.
    expected_control_numbers = [record1_control_number, record2_control_number]
    both_pids = [("lit", record1_control_number), ("lit", record2_control_number)]
    result = LiteratureSearch().get_records_by_pids(both_pids)
    assert len(result) == len(expected_control_numbers)
    for rec in result:
        found_control_number = rec.to_dict()["control_number"]
        assert found_control_number in expected_control_numbers
def query_report_number(report_number):
    """Look up the literature record matching a report number.

    Args:
        report_number: value matched against ``report_numbers.value.fuzzy``.

    Returns:
        The result of ``get_record_for_pid_or_none`` for the hit's control
        number when the search yields exactly one hit, otherwise ``None``.
    """
    fuzzy_match = Q("match", report_numbers__value__fuzzy=report_number)
    response = (
        LiteratureSearch().query(fuzzy_match).source(["control_number"]).execute()
    )

    # Zero or several hits are both treated as "no record".
    if len(response.hits) != 1:
        return None

    matched_control_number = response.hits[0]["control_number"]
    return get_record_for_pid_or_none("lit", matched_control_number)
def get_literature_recids_for_orcid(orcid):
    """Return the Literature recids that were claimed by an ORCiD.

    A claim of Literature record Y by Author record X is stored in Y as an
    author object whose ``$ref`` points to X and whose ``curated_relation``
    key is ``True``. This function therefore looks up in the DB the Author
    record holding the given ORCiD, and then searches ES with that author's
    recid for Literature records with such an author object.

    Args:
        orcid (str): the ORCiD.

    Return:
        list(int): the recids of the Literature records that were claimed by
        that ORCiD.
    """
    orcid_object = f'[{{"schema": "ORCID", "value": "{orcid}"}}]'

    # this first query is written in a way that can use the index on (json -> ids)
    ids_column = type_coerce(RecordMetadata.json, JSONB)["ids"]
    author_row = (
        db.session.query(RecordMetadata.id)
        .filter(ids_column.contains(orcid_object))
        .one()
    )
    author_rec_uuid = author_row.id

    author_pid = (
        db.session.query(PersistentIdentifier)
        .filter(
            PersistentIdentifier.object_type == "rec",
            PersistentIdentifier.object_uuid == author_rec_uuid,
            PersistentIdentifier.pid_type == "aut",
        )
        .one()
    )

    # For a redirected author, use the pid value of its redirect target.
    if author_pid.is_redirected():
        author_recid = InspireRedirect.get_redirect(author_pid).pid_value
    else:
        author_recid = author_pid.pid_value

    curated_by_author = Q("match", authors__curated_relation=True) & Q(
        "match", **{"authors.record.$ref": author_recid}
    )
    search_by_curated_author = (
        LiteratureSearch()
        .query("nested", path="authors", query=curated_by_author)
        .params(_source=["control_number"], size=9999)
    )

    return [hit["control_number"] for hit in search_by_curated_author]
def test_migrate_from_mirror_removes_record_from_es(inspire_app, datadir):
    """Migrating the deleted fixture removes record 12345 from ES."""

    def count_es_hits():
        # Number of ES hits for the record behind pid value 12345.
        record_lit_uuid = LiteratureRecord.get_uuid_from_pid_value(12345)
        record_lit_es = (
            LiteratureSearch().get_record(str(record_lit_uuid)).execute().hits
        )
        return len(record_lit_es)

    dummy_record_path = datadir / "dummy_record.json"
    create_record("lit", data=orjson.loads(dummy_record_path.read_text()))

    expected_record_lit_es_len = 1
    assert expected_record_lit_es_len == count_es_hits()

    record_deleted_fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join("fixtures", "dummy_deleted.xml")
    )
    migrate_from_file(record_deleted_fixture_path)
    current_search.flush_and_refresh("records-hep")

    expected_record_lit_es_len = 0
    assert expected_record_lit_es_len == count_es_hits()
def test_reindex_one_type_of_record(inspire_app, cli):
    """``index reindex -p lit`` indexes the lit record and no author record."""
    record_lit = create_record_factory("lit")
    create_record_factory("aut")

    cli.invoke(["index", "reindex", "-p", "lit"])
    current_search.flush_and_refresh("*")

    lit_hits = LiteratureSearch().execute().hits.hits
    results_lit_uuid = lit_hits[0]["_id"]
    aut_hits = AuthorsSearch().execute().hits.hits

    assert str(record_lit.id) == results_lit_uuid
    assert 0 == len(aut_hits)
def test_get_search_with_source_with_LiteratureSearch_instance_without_config(
    inspire_app,
):
    """With both source settings set to ``None`` no ``_source`` is added."""
    config = dict.fromkeys(
        (
            "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE",
            "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE",
        ),
        None,
    )

    with override_config(**config), current_app.test_request_context():
        search = get_search_with_source(LiteratureSearch())

        assert "_source" not in search.to_dict()
def test_get_search_with_source_with_LiteratureSearch_instance_without_config(
    base_app,
):
    """With both source settings patched to ``None`` no ``_source`` is added."""
    no_source_settings = {
        "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE": None,
        "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE": None,
    }
    patched_config = patch.dict(base_app.config, no_source_settings)

    with patched_config, base_app.test_request_context():
        plain_search = LiteratureSearch()
        search_with_source = get_search_with_source(plain_search)

        serialized = search_with_source.to_dict()
        assert "_source" not in serialized
def test_get_search_with_source_with_fields_query_param_and_wrong_mimetype(
    inspire_app,
):
    """``fields`` combined with bibtex / latex Accept headers is rejected."""
    forbidden_mimetypes = (
        "application/x-bibtex",
        "application/vnd+inspire.latex.eu+x-latex",
        "application/vnd+inspire.latex.us+x-latex",
    )
    for mimetype in forbidden_mimetypes:
        request_context = current_app.test_request_context(
            "?fields=authors,ids", headers={"Accept": mimetype}
        )
        with request_context:
            with pytest.raises(FieldsParamForbidden):
                search = LiteratureSearch()
                get_search_with_source(search)
def test_reference_convert_old_publication_info_to_new_with_exception(
    mock_convert_old_publication_info_to_new, inspire_app
):
    """The reference comes back unchanged when the mocked converter raises."""
    mock_convert_old_publication_info_to_new.side_effect = Exception()

    publication_info = {
        "journal_title": "JHEP",
        "journal_volume": "06",
        "page_start": "131",
        "year": 2018,
    }
    reference = {"reference": {"publication_info": publication_info}}

    converted = LiteratureSearch().convert_old_publication_info_to_new(reference)

    assert reference == converted
def test_cli_reindex_deleted_and_redirected_records(inspire_app, cli):
    """Reindexing via the CLI drops deleted and redirected records from ES.

    One record is redirected and one deleted while automatic indexing is
    disconnected, so ES still holds all three records until
    ``index reindex -p lit`` runs; afterwards only the surviving record is
    expected.
    """
    redirected = create_record("lit")
    new_record = create_record("lit")
    deleted = create_record("lit")

    # disable signals so re-indexing won't run automatically after record update
    models_committed.disconnect(index_after_commit)
    try:
        # redirect one record
        new_record_data = dict(new_record)
        new_record_data["deleted_records"] = [redirected["self"]]
        new_record.update(new_record_data)

        # delete one record
        deleted.delete()
    finally:
        # re-enable signals, also when update/delete raised, so the
        # disconnected signal does not leak into later tests
        models_committed.connect(index_after_commit)

    # check if deleted and redirected were left in ES
    current_search.flush_and_refresh("*")
    expected_control_numbers = [
        redirected.control_number,
        new_record.control_number,
        deleted.control_number,
    ]
    results = LiteratureSearch().query_from_iq("").execute()
    control_numbers_from_es = [x.control_number for x in results.hits]
    assert set(control_numbers_from_es) == set(expected_control_numbers)

    cli.invoke(["index", "reindex", "-p", "lit"])
    current_search.flush_and_refresh("*")

    expected_control_numbers = [new_record.control_number]
    results = LiteratureSearch().query_from_iq("").execute()
    control_numbers_from_es = [x.control_number for x in results.hits]
    assert set(control_numbers_from_es) == set(expected_control_numbers)
def test_migrate_from_mirror_doesnt_index_deleted_records(inspire_app):
    """After migrating both fixtures, record 12345 has exactly one ES hit."""
    fixture_names = ("dummy.xml", "deleted_record.xml")
    record_fixture_path, record_fixture_path_deleted = [
        pkg_resources.resource_filename(__name__, os.path.join("fixtures", name))
        for name in fixture_names
    ]

    migrate_from_file(record_fixture_path)
    migrate_from_file(record_fixture_path_deleted)
    current_search.flush_and_refresh("records-hep")

    record_lit_uuid = LiteratureRecord.get_uuid_from_pid_value(12345)
    search = LiteratureSearch().get_record(str(record_lit_uuid))
    record_lit_es = search.execute().hits

    assert 1 == len(record_lit_es)
def test_reindex_all_types_records(inspire_app, cli):
    """``index reindex --all`` indexes the lit, aut, job and con records."""
    record_lit = create_record_factory("lit")
    record_aut = create_record_factory("aut")
    record_job = create_record_factory("job")
    record_con = create_record_factory("con")

    cli.invoke(["index", "reindex", "--all"])
    current_search.flush_and_refresh("*")

    def first_hit_id(search_class):
        # "_id" of the first hit returned by an unfiltered search.
        return search_class().execute().hits.hits[0]["_id"]

    search_classes = (LiteratureSearch, AuthorsSearch, ConferencesSearch, JobsSearch)
    indexed_ids = [first_hit_id(search_class) for search_class in search_classes]
    created_records = (record_lit, record_aut, record_con, record_job)

    for created, indexed_id in zip(created_records, indexed_ids):
        assert str(created.id) == indexed_id
def test_get_search_with_source_with_LiteratureSearch_instance_with_not_defined_headers(
    inspire_app,
):
    """No ``_source`` is set when the Accept type has no configured entry."""
    includes_by_content_type = {
        "application/vnd+inspire.record.ui+json": ["title", "description"]
    }
    excludes_by_content_type = {"application/bibtex": ["control_number"]}
    config = {
        "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE": includes_by_content_type,
        "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE": excludes_by_content_type,
    }
    request_context = current_app.test_request_context(
        headers={"Accept": "application/json"}
    )

    with override_config(**config), request_context:
        search = get_search_with_source(LiteratureSearch())

        assert "_source" not in search.to_dict()
def _find_matching_author_in_lit_record(author_parsed_name, lit_recid):
    """Find the author record referenced by the single matching author.

    Searches the literature record with control number ``lit_recid`` using
    the nested query generated by ``author_parsed_name`` (with ``inner_hits``
    enabled) and inspects the ``authors`` inner hits.

    Args:
        author_parsed_name: object whose ``generate_es_query()`` returns a
            dict with a ``"nested"`` key.
        lit_recid: control number of the literature record to search in.

    Returns:
        The result of ``get_recid_from_ref`` on the matched author's
        ``record`` when exactly one record and exactly one author matched,
        otherwise ``None``.
    """
    author_name_query = author_parsed_name.generate_es_query()
    author_name_query["nested"]["inner_hits"] = {}
    query = {
        "bool": {
            "must": [author_name_query, {"match": {"control_number": lit_recid}}]
        }
    }
    hits = LiteratureSearch().query(query).execute()

    # Check the hit count before touching hits[0]: indexing an empty result
    # would raise IndexError instead of reporting "no match".
    if len(hits) != 1:
        return None

    authors_matched = hits[0].meta["inner_hits"].to_dict().get("authors")
    # .get() yields None when there are no "authors" inner hits; len(None)
    # would raise TypeError.
    if authors_matched is None or len(authors_matched) != 1:
        return None

    author_record = authors_matched[0]["record"].to_dict()
    return get_recid_from_ref(author_record)
def test_return_record_for_journal_info_search_with_journal_title_with_dots_and_spaces(
    inspire_app,
):
    """"Phys.Lett.B" and "Phys. Lett. B" both return record 1 as first hit."""
    expected_control_number = 1
    publication_info = {
        "journal_title": "Phys.Lett.B",
        "journal_volume": "704",
        "page_start": "223",
        "year": 2011,
    }
    cited_record_json = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [publication_info],
        "titles": [{"title": "The Strongly-Interacting Light Higgs"}],
    }
    journal_json = {
        "short_title": "Phys.Lett.B",
        "journal_title": {"title": "Phys Lett B"},
    }

    create_record("jou", data=journal_json)
    create_record("lit", cited_record_json)

    for query in ["Phys.Lett.B", "Phys. Lett. B"]:
        response = LiteratureSearch().query_from_iq(query).execute()
        first_hit = response["hits"]["hits"][0]
        response_record_control_number = first_hit["_source"]["control_number"]
        assert expected_control_number == response_record_control_number
def find_references(references, requested_format):
    """Resolve references to formatted entries of the matching records.

    For each reference a search keyword is chosen by the first regular
    expression that matches it, and ``"<keyword>:<query>"`` is searched with
    at most two hits requested.

    Args:
        references: iterable of ``(ref, line)`` pairs.
        requested_format: key into ``FORMAT_TO_SOURCE_FIELD`` selecting the
            source field that holds the formatted entry.

    Returns:
        A ``(ret, errors)`` tuple. ``ret`` holds, for every reference with
        exactly one hit, the formatted entry with ``{<texkey>`` replaced by
        ``{<ref>``. ``errors`` holds a dict with "ref", "line" and "type"
        ("not found" or "ambiguous") for every other reference.
    """
    display_format = FORMAT_TO_SOURCE_FIELD[requested_format]

    # Ordered: the first pattern found in the reference decides the keyword.
    keyword_patterns = (
        (r"^\d{4}[\w.&]{15}$", "external_system_identifiers.value"),  # ads
        (r".*\:\d{4}\w\w\w?", "texkey"),
        (r".*\/\d{7}", "eprint"),
        (r"\d{4}\.\d{4,5}", "eprint"),
        (r"\w\.\w+\.\w", "j"),
        (r"\w\-\w", "r"),
    )

    ret = []
    errors = []
    for ref, line in references:
        keyword = next(
            (name for pattern, name in keyword_patterns if re.search(pattern, ref)),
            None,
        )
        # Only the "j" keyword rewrites the query: dots become commas.
        query = re.sub(r"\.", ",", ref) if keyword == "j" else ref

        search = LiteratureSearch().query_from_iq(f"{keyword}:{query}")
        search = search.params(
            size=2, _source=[display_format, "texkeys", "control_number"]
        )
        hits = search.execute().hits.hits

        hit_count = len(hits)
        if hit_count != 1:
            error_type = "not found" if hit_count == 0 else "ambiguous"
            errors.append({"ref": ref, "line": line, "type": error_type})
            continue

        source_field = hits[0]["_source"]
        control_number = source_field["control_number"]
        # Without "texkeys" the control number stands in for the texkey.
        texkey = getattr(source_field, "texkeys", [control_number])[0]
        formatted_entry = source_field[display_format]
        ret.append(formatted_entry.replace(f"{{{texkey}", f"{{{ref}"))

    return ret, errors
def test_migrate_from_mirror_doesnt_index_deleted_records(base_app, db, es_clear):
    """Only pid 12345 exists after migrating both fixtures; it has one ES hit."""
    fixtures_dir = "fixtures"
    record_fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join(fixtures_dir, "dummy.xml")
    )
    record_fixture_path_deleted = pkg_resources.resource_filename(
        __name__, os.path.join(fixtures_dir, "deleted_record.xml")
    )

    for fixture_path in (record_fixture_path, record_fixture_path_deleted):
        migrate_from_file(fixture_path)
    es_clear.indices.refresh("records-hep")

    expected_record_lit_es_len = 1
    record_lit_uuid = LiteratureRecord.get_uuid_from_pid_value(12345)

    # Pid value 1234 must not resolve to a record.
    with pytest.raises(PIDDoesNotExistError):
        LiteratureRecord.get_uuid_from_pid_value(1234)

    es_response = LiteratureSearch().get_record(str(record_lit_uuid)).execute()
    assert expected_record_lit_es_len == len(es_response.hits)
def populate_curated_relation(hits):
    """Flag the hits whose record has a curated relation with the author.

    The author recid is the part of the ``author`` request value before the
    first underscore. Among the given hits, those whose record has a nested
    author with ``curated_relation`` true and ``record.$ref`` matching that
    recid get ``curated_relation = True`` set on their ``_source``.

    Args:
        hits: hits, each exposing ``["_source"]["control_number"]``.

    Returns:
        The same ``hits`` object, modified in place.
    """
    author_value = request.values.get("author", "", type=str)
    author_recid = author_value.split("_")[0]
    hits_control_numbers = [hit["_source"]["control_number"] for hit in hits]

    curated_by_author = Q("match", authors__curated_relation=True) & Q(
        "match", **{"authors.record.$ref": author_recid}
    )
    papers_with_author_curated = (
        LiteratureSearch()
        .filter("terms", control_number=hits_control_numbers)
        .query("nested", path="authors", query=curated_by_author)
        .params(_source=["control_number"], size=9999)
    )
    curated_recids = {
        paper["control_number"] for paper in papers_with_author_curated
    }

    for hit in hits:
        if hit["_source"]["control_number"] not in curated_recids:
            continue
        hit["_source"]["curated_relation"] = True
    return hits
def literature():
    """Return a new ``LiteratureSearch`` instance."""
    search = LiteratureSearch()
    return search
def test_reference_convert_old_publication_info_to_new_with_empty_reference(
    inspire_app,
):
    """A reference with empty ``publication_info`` is returned unchanged."""
    empty_publication_info = {}
    reference = {"reference": {"publication_info": empty_publication_info}}

    converted = LiteratureSearch().convert_old_publication_info_to_new(reference)

    assert reference == converted