def test_cv_search_cached(inspire_app):
    """Check that the CV-format search still returns the originally created title.

    The record is updated while ``index_after_commit`` is disconnected from
    ``models_committed``; the response is then expected to contain
    "Yet another title" and not "Modified title".
    """
    headers = {"Accept": "text/vnd+inspire.html+html"}
    original_data = {
        "control_number": 637275232,
        "titles": [{"title": "Yet another title"}],
    }
    record = create_record("lit", data=original_data)

    expected_status_code = 200
    expected_result = '<!DOCTYPE html><html><body> <p><b> <a href="https://localhost:5000/literature/637275232"> Yet another title </a> </b></p> <br></body></html>'

    models_committed.disconnect(index_after_commit)
    try:
        modified_data = dict(record)
        modified_data["titles"] = [{"title": "Modified title"}]
        record.update(modified_data)

        with inspire_app.test_client() as client:
            response = client.get("/literature", headers=headers)

        response_status_code = response.status_code
        response_data = response.get_data(as_text=True).replace("\n", "")

        assert expected_status_code == response_status_code
        assert expected_result == response_data
    finally:
        # Always restore the receiver, even when an assertion fails, so the
        # disconnected state cannot leak into other tests.
        models_committed.connect(index_after_commit)
def test_index_record_manually(app, celery_app_with_context,
                               celery_session_worker, retry_until_matched):
    """Check that a record committed without the receiver is indexed by ``rec.index()``.

    After the commit the ``records-hep`` search must report zero hits; after
    calling ``rec.index()`` the search is retried until ``hits.total`` is 1.
    """
    data = faker.record("lit")
    rec = LiteratureRecord.create(data)

    models_committed.disconnect(index_after_commit)
    try:
        db.session.commit()
    finally:
        # Reconnect even if the commit raises.
        models_committed.connect(index_after_commit)

    es.indices.refresh("records-hep")
    result = es.search("records-hep")
    assert result["hits"]["total"] == 0

    rec.index()

    steps = [
        {"step": es.indices.refresh, "args": ["records-hep"]},
        {
            "step": es.search,
            "args": ["records-hep"],
            "expected_result": {
                "expected_key": "hits.total",
                "expected_result": 1,
            },
        },
    ]
    retry_until_matched(steps)
def test_process_references_in_records_reindexes_experiments_when_linked_experiments_change(
        app, celery_app_with_context, celery_session_worker):
    """Check that ``process_references_in_records`` updates the linked experiment in ES.

    An experiment and a literature record referencing it are created without
    automatic indexing; after the task runs, the experiment document in ES is
    expected to report ``number_of_papers == 1``.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        experiment_data = faker.record("exp", with_control_number=True)
        experiment = InspireRecord.create(experiment_data)
        db.session.commit()

        experiment_control_number = experiment["control_number"]
        exp_ref = f"http://localhost:8000/api/experiments/{experiment_control_number}"

        data = faker.record("lit", with_control_number=True)
        data["accelerator_experiments"] = [
            {"legacy_name": "LIGO", "record": {"$ref": exp_ref}}
        ]
        record = InspireRecord.create(data)
        db.session.commit()
    finally:
        # Restore the receiver even if the setup above fails.
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([record.id])
    task.get(timeout=5)

    experiment_record_es = InspireSearch.get_record_data_from_es(experiment)
    expected_number_of_paper = 1

    assert expected_number_of_paper == experiment_record_es["number_of_papers"]
def migrate_chunk(chunk, skip_files=False):
    """Migrate a chunk of raw records and bulk-index the results.

    Each raw record is passed to ``migrate_and_insert_record`` inside a nested
    transaction; every truthy result is queued via ``create_index_op`` and the
    queue is sent to Elasticsearch with ``es_bulk`` after the commit.

    Args:
        chunk: iterable of raw records to migrate.
        skip_files: forwarded to ``migrate_and_insert_record``.
    """
    # Indexing is done in bulk below, so the per-commit receiver is disabled.
    models_committed.disconnect(index_after_commit)
    try:
        index_queue = []

        try:
            for raw_record in chunk:
                with db.session.begin_nested():
                    record = migrate_and_insert_record(
                        raw_record,
                        skip_files=skip_files,
                    )
                    if record:
                        index_queue.append(create_index_op(record))
            db.session.commit()
        finally:
            db.session.close()

        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Reconnect the receiver even when migration or bulk indexing fails,
        # so the process is not left with auto-indexing silently disabled.
        models_committed.connect(index_after_commit)
def migrate_chunk(chunk):
    """Migrate a chunk of raw records and bulk-index the results.

    While the chunk is processed, ``receive_after_model_commit`` is
    disconnected from ``models_committed`` and the ``current_collections``
    signals are unregistered; both are restored afterwards.

    Args:
        chunk: iterable of raw records passed to ``migrate_and_insert_record``.
    """
    models_committed.disconnect(receive_after_model_commit)
    current_collections.unregister_signals()
    try:
        index_queue = []

        try:
            for raw_record in chunk:
                with db.session.begin_nested():
                    record = migrate_and_insert_record(raw_record)
                    if record:
                        index_queue.append(create_index_op(record))
            db.session.commit()
        finally:
            db.session.close()

        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Restore both sets of signal handlers even when the chunk fails.
        models_committed.connect(receive_after_model_commit)
        current_collections.register_signals()
def test_process_references_in_records_reindexes_conferences_when_pub_info_changes(
        inspire_app, celery_app_with_context, celery_session_worker):
    """Check that ``process_references_in_records`` updates the linked conference in ES.

    A conference and a conference paper referencing it are created without
    automatic indexing; after the task runs, the conference document in ES is
    expected to report ``number_of_contributions == 1``.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        conference_data = faker.record("con", with_control_number=True)
        conference_record = InspireRecord.create(conference_data)
        conference_control_number = conference_record["control_number"]
        conf_ref = f"http://localhost:8000/api/conferences/{conference_control_number}"

        data = faker.record("lit", with_control_number=True)
        data["publication_info"] = [{"conference_record": {"$ref": conf_ref}}]
        data["document_type"] = ["conference paper"]
        record = InspireRecord.create(data)
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the setup above fails)
        models_committed.connect(index_after_commit)

    uuids = [record.id]
    task = process_references_in_records.delay(uuids)
    # Block until the task finishes; its return value is not checked here.
    task.get(timeout=5)

    conference_record_es = InspireSearch.get_record_data_from_es(
        conference_record)
    expected_number_of_contributions = 1

    assert (expected_number_of_contributions ==
            conference_record_es["number_of_contributions"])
def test_model_signals(db, Todo):
    """Check that ``models_committed`` reports insert, update and delete changes.

    A local receiver collects the ``changes`` list sent with each commit; the
    test asserts one ``(instance, operation)`` entry per commit.
    """
    recorded = []

    def committed(sender, changes):
        assert isinstance(changes, list)
        recorded.extend(changes)

    models_committed.connect(committed)
    try:
        todo = Todo("Awesome", "the text")
        db.session.add(todo)
        # Nothing is reported until the session is committed.
        assert len(recorded) == 0
        db.session.commit()
        assert len(recorded) == 1
        assert recorded[0][0] == todo
        assert recorded[0][1] == "insert"
        del recorded[:]

        todo.text = "aha"
        db.session.commit()
        assert len(recorded) == 1
        assert recorded[0][0] == todo
        assert recorded[0][1] == "update"
        del recorded[:]

        db.session.delete(todo)
        db.session.commit()
        assert len(recorded) == 1
        assert recorded[0][0] == todo
        assert recorded[0][1] == "delete"
    finally:
        # Detach the receiver even when an assertion above fails.
        models_committed.disconnect(committed)
def test_index_record(inspire_app, celery_app_with_context, celery_session_worker):
    """Check that the ``index_records`` task indexes one record of each type.

    The task result must equal the stringified uuids, and every record must be
    retrievable from ES with a matching ``control_number``.
    """
    models_committed.disconnect(index_after_commit)
    try:
        record_types = ("lit", "aut", "job", "jou", "exp", "con", "dat", "ins")
        records = [create_record_async(record_type) for record_type in record_types]

        uuids = [record.id for record in records]
        task = index_records.delay(uuids)
        results = task.get(timeout=5)

        uuids = [str(uuid) for uuid in uuids]
        assert results == uuids

        for record in records:
            result = InspireSearch.get_record_data_from_es(record)
            assert record["control_number"] == result["control_number"]
    finally:
        # Restore the receiver even when the task or an assertion fails.
        models_committed.connect(index_after_commit)
def create_records_from_mirror_recids(recids):
    """Task which migrates records

    Args:
        recids: identifiers looked up with ``LegacyRecordsMirror.query.get``.

    Returns:
        list: stringified ids of the records that were migrated successfully.
    """
    models_committed.disconnect(index_after_commit)
    processed_records = set()
    try:
        for recid in recids:
            try:
                LOGGER.info("Migrate record from mirror", recid=recid)
                with db.session.begin_nested():
                    record = migrate_record_from_mirror(
                        LegacyRecordsMirror.query.get(recid))
            except Exception:
                # Best effort: a single broken record must not stop the batch.
                LOGGER.exception("Cannot migrate record", recid=recid)
                continue

            if record:
                processed_records.add(str(record.id))
            else:
                LOGGER.warning("Record is empty", recid=recid)
        db.session.commit()
    finally:
        # Reconnect even if the commit raises, so the receiver is not left
        # disconnected for whatever runs next in this process.
        models_committed.connect(index_after_commit)
    return list(processed_records)
def test_index_record_manually(inspire_app, clean_celery_session):
    """Check that a record committed without the receiver is indexed by ``rec.index()``.

    The ES hit count is asserted to be 0 after the commit and 1 after the
    explicit ``rec.index()`` call.
    """
    data = faker.record("lit")
    rec = LiteratureRecord.create(data)

    models_committed.disconnect(index_after_commit)
    try:
        db.session.commit()
    finally:
        # Reconnect even if the commit raises.
        models_committed.connect(index_after_commit)

    assert_es_hits_count(0)
    rec.index()
    assert_es_hits_count(1)
def test_index_record_manually(inspire_app, clean_celery_session):
    """Check that a record committed without the receiver is indexed by ``rec.index()``.

    The record must be absent from ES after the commit and present after the
    explicit ``rec.index()`` call.
    """
    data = faker.record("lit")
    rec = LiteratureRecord.create(data)

    models_committed.disconnect(index_after_commit)
    try:
        db.session.commit()
    finally:
        # Reconnect even if the commit raises.
        models_committed.connect(index_after_commit)

    assert_record_not_in_es(rec["control_number"])
    rec.index()
    assert_record_in_es(rec["control_number"])
def signalling(app, changes, **kwargs):
    """Mask the username of the first ``User`` row when a ``User`` change is reported.

    For the first change whose instance shares ``User``'s table name, the
    function disconnects itself from ``models_committed``, opens a scoped
    session, rewrites the username of the first user if it equals
    ``'signalling_test'``, commits, and reconnects itself. Later changes in
    the same batch are ignored.

    Args:
        app: sender argument; not used by this function.
        changes: iterable of ``(instance, operation)`` pairs.
        **kwargs: accepted and ignored.
    """
    for instance, _operation in changes:
        if instance.__tablename__ != User.__tablename__:
            continue

        # Presumably disconnected so the commit below does not call this
        # function again -- confirm against how it is registered.
        models_committed.disconnect(signalling)
        try:
            session = db.create_scoped_session()
            try:
                user = session.query(User).first()
                if user and user.username == 'signalling_test':
                    user.username = '******'
                    session.merge(user)
                    session.commit()
            finally:
                # Release the scoped session even if the query/commit fails.
                session.remove()
        finally:
            # Never leave the receiver disconnected after an error.
            models_committed.connect(signalling)
        break
def test_process_references_in_records_with_different_type_of_records_doesnt_throw_an_exception(
        inspire_app, celery_app_with_context, celery_session_worker):
    """Check that ``process_references_in_records`` accepts a mix of record types.

    The task receives non-literature records together with two citing
    literature records; it must return all uuids, and each cited record must
    end up in ES with ``citation_count == 1``.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        cited_record_1 = LiteratureRecord.create(faker.record("lit"))
        cited_record_2 = LiteratureRecord.create(faker.record("lit"))

        data_citing_record_1 = faker.record(
            "lit", literature_citations=[cited_record_1["control_number"]])
        citing_record_1 = LiteratureRecord.create(data_citing_record_1)

        data_citing_record_2 = faker.record(
            "lit", literature_citations=[cited_record_2["control_number"]])
        citing_record_2 = LiteratureRecord.create(data_citing_record_2)

        db.session.commit()

        records = [
            create_record_async("aut"),
            create_record_async("job"),
            create_record_async("jou"),
            create_record_async("exp"),
            create_record_async("con"),
            create_record_async("dat"),
            create_record_async("ins"),
        ]
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the setup above fails)
        models_committed.connect(index_after_commit)

    uuids = [record.id for record in records] + [
        citing_record_1.id,
        citing_record_2.id,
    ]

    task = process_references_in_records.delay(uuids)
    results = task.get(timeout=5)

    uuids = [str(uuid) for uuid in uuids]
    assert results == uuids

    result_cited_record_1 = InspireSearch.get_record_data_from_es(
        cited_record_1)
    expected_result_cited_record_1_citation_count = 1
    assert (expected_result_cited_record_1_citation_count ==
            result_cited_record_1["citation_count"])

    result_cited_record_2 = InspireSearch.get_record_data_from_es(
        cited_record_2)
    expected_result_cited_record_2_citation_count = 1
    assert (expected_result_cited_record_2_citation_count ==
            result_cited_record_2["citation_count"])
def test_gracefully_handle_records_updating_in_wrong_order(
        inspire_app, clean_celery_session):
    """Check ES contents when ``index_record`` is called with versions out of order.

    The citing record is indexed at its latest version, then again at the
    previous version; the assertions pin the cited record's citation count and
    the citing record's titles after each call.
    """
    # We want to run indexing in weird order, so disable auto indexing
    models_committed.disconnect(index_after_commit)
    try:
        cited_record = LiteratureRecord.create(data=faker.record("lit"))
        record_data = faker.record(
            "lit", literature_citations=[cited_record.control_number])
        record = LiteratureRecord.create(data=record_data)
        db.session.commit()

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        index_record(record.id, record.model.versions[-1].version_id)
        assert LiteratureSearch().get_source(
            cited_record.id)["citation_count"] == 1

        # First update: drop the references.
        data = dict(record)
        del data["references"]
        record.update(data)
        db.session.commit()

        # Second update: change the title.
        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        data = dict(record)
        data["titles"][0] = {"title": "New Title"}
        record.update(data)
        db.session.commit()

        # Index the latest version first ...
        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        index_record(record.id, record.model.versions[-1].version_id)

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)
        assert LiteratureSearch().get_source(
            cited_record.id)["citation_count"] == 1
        assert LiteratureSearch().get_source(record.id)["titles"] == [{
            "title": "New Title"
        }]

        # ... then the previous one.
        index_record(record.id, record.model.versions[-2].version_id)

        assert LiteratureSearch().get_source(
            cited_record.id)["citation_count"] == 0
        assert LiteratureSearch().get_source(record.id)["titles"] == [{
            "title": "New Title"
        }]
    finally:
        # Restore auto indexing even when an assertion above fails.
        models_committed.connect(index_after_commit)
def test_index_record_fulltext_manually(inspire_app, clean_celery_session,
                                        override_config, s3, datadir):
    """Check that ``rec.index_fulltext()`` produces an ``attachment`` on the ES document.

    A PDF is uploaded to S3 under ``KEY``; a literature record pointing at it
    is committed without automatic indexing, then indexed explicitly. The
    indexed document must contain ``attachment`` and must not contain ``text``.
    """
    metadata = {"foo": "bar"}
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        pdf_path,
        metadata,
        **{"ContentType": "application/pdf"},
    )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        data = faker.record("lit")
        data.update({
            "documents": [{
                "source": "arxiv",
                "fulltext": True,
                "filename": "new_doc.pdf",
                "key": KEY,
                "url": "http://www.africau.edu/images/default/sample.pdf",
            }]
        })
        rec = LiteratureRecord.create(data)

        models_committed.disconnect(index_after_commit)
        try:
            db.session.commit()
        finally:
            # Reconnect even if the commit raises.
            models_committed.connect(index_after_commit)

        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        # Named differently from the module-level ``assert_record_in_es``
        # helper so the local function does not shadow it.
        def assert_fulltext_document_in_es():
            current_search.flush_and_refresh("*")
            record_lit_es = (LiteratureSearch().get_record(str(
                rec.id)).execute().hits.hits[0])
            document = record_lit_es._source["documents"][0]
            assert "attachment" in document
            assert "text" not in document  # pipeline should remove it

        retry_until_pass(assert_fulltext_document_in_es, timeout=90,
                         retry_interval=5)
def test_process_references_in_records_process_author_records(
        mock_batch_index, inspire_app, clean_celery_session):
    """Check that processing a changed author triggers batch indexing of its papers.

    After the author's name is updated (without automatic indexing) and
    ``process_references_in_records`` runs for the author, the first call to
    ``mock_batch_index`` must have received both literature record ids.
    """
    author_record = AuthorsRecord.create(faker.record("aut"))
    authors_payload = {
        "authors": [{
            "full_name": author_record["name"]["value"],
            "record": author_record["self"],
        }]
    }
    lit_record = LiteratureRecord.create(
        faker.record("lit", data=dict(authors_payload)))
    lit_record_2 = LiteratureRecord.create(
        faker.record("lit", data=dict(authors_payload)))

    db.session.commit()

    def assert_records_in_es():
        lit_record_from_es = InspireSearch.get_record_data_from_es(lit_record)
        lit_record_from_es_2 = InspireSearch.get_record_data_from_es(
            lit_record_2)
        aut_record_from_es = InspireSearch.get_record_data_from_es(
            author_record)
        assert lit_record_from_es and aut_record_from_es and lit_record_from_es_2

    retry_until_pass(assert_records_in_es, retry_interval=5)

    models_committed.disconnect(index_after_commit)
    try:
        author_record["name"]["value"] = "Another Name"
        author_record.update(dict(author_record))
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the update above fails)
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([author_record.id])
    task.get(timeout=5)

    assert sorted(mock_batch_index.mock_calls[0][1][0]) == sorted(
        [str(lit_record.id), str(lit_record_2.id)])
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Create records from a chunk and send them to Elasticsearch in bulk.

    Several receivers are disconnected for the duration of the chunk and
    reconnected in the ``finally`` clause. A record that fails is logged and,
    when ``broken_output`` is given, appended to that file; the remaining
    records are still processed.

    Args:
        chunk: iterable of records passed to ``create_record``.
        broken_output: optional path of a file to which failing records are
            appended.
        dry_run: forwarded to ``create_record``.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update,
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            # Pre-set so the error log below works even if create_record fails.
            recid = json = None
            try:
                recid, json = create_record(record, force=True, dry_run=dry_run)
                index = (get_record_index(json) or
                         cfg['SEARCH_ELASTIC_DEFAULT_INDEX'])
                before_record_index.send(recid, json=json, index=index)
                json.update({'_index': index, '_type': 'record',
                             '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    # Use a context manager so the handle is closed (and the
                    # write flushed) for every failing record instead of
                    # leaking one open file per failure.
                    with open(broken_output, "a") as broken_output_fd:
                        print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
def test_process_references_in_records_process_conference_records(
        mock_batch_index, inspire_app, clean_celery_session):
    """Check that processing a changed conference triggers batch indexing of its papers.

    After the conference title is updated (without automatic indexing) and
    ``process_references_in_records`` runs for the conference, the first call
    to ``mock_batch_index`` must have received both literature record ids.
    """
    conf_record = ConferencesRecord.create(
        faker.record("con", data={"titles": [{"title": "Test conference"}]}))
    lit_data = {
        "publication_info": [{
            "conference_record": {"$ref": conf_record["self"]["$ref"]}
        }],
        "document_type": ["conference paper"],
    }
    lit_record = LiteratureRecord.create(faker.record("lit", data=lit_data))
    lit_record_2 = LiteratureRecord.create(faker.record("lit", data=lit_data))

    db.session.commit()

    def assert_records_in_es():
        lit_record_from_es = InspireSearch.get_record_data_from_es(lit_record)
        lit_record_from_es_2 = InspireSearch.get_record_data_from_es(
            lit_record_2)
        conf_record_from_es = InspireSearch.get_record_data_from_es(conf_record)
        assert lit_record_from_es and conf_record_from_es and lit_record_from_es_2

    retry_until_pass(assert_records_in_es, retry_interval=5)

    models_committed.disconnect(index_after_commit)
    try:
        conf_record["titles"] = [{"title": "Southern California Strings Seminar "}]
        conf_record.update(dict(conf_record))
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the update above fails)
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([conf_record.id])
    task.get(timeout=5)

    assert sorted(mock_batch_index.mock_calls[0][1][0]) == sorted(
        [lit_record.id, lit_record_2.id])
def test_process_references_in_records(inspire_app, celery_app_with_context,
                                       celery_session_worker):
    """Check that ``process_references_in_records`` updates cited records in ES.

    Two cited and two citing literature records are created without automatic
    indexing; after the task runs on the citing records, each cited record is
    expected in ES with ``citation_count == 1``.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        cited_record_1 = LiteratureRecord.create(faker.record("lit"))
        cited_record_2 = LiteratureRecord.create(faker.record("lit"))

        data_citing_record_1 = faker.record(
            "lit", literature_citations=[cited_record_1["control_number"]])
        citing_record_1 = LiteratureRecord.create(data_citing_record_1)

        data_citing_record_2 = faker.record(
            "lit", literature_citations=[cited_record_2["control_number"]])
        citing_record_2 = LiteratureRecord.create(data_citing_record_2)

        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the setup above fails)
        models_committed.connect(index_after_commit)

    uuids = [citing_record_1.id, citing_record_2.id]
    task = process_references_in_records.delay(uuids)
    # Block until the task finishes; its return value is not checked here.
    task.get(timeout=5)

    result_cited_record_1 = InspireSearch.get_record_data_from_es(
        cited_record_1)
    expected_result_cited_record_1_citation_count = 1
    assert (expected_result_cited_record_1_citation_count ==
            result_cited_record_1["citation_count"])

    result_cited_record_2 = InspireSearch.get_record_data_from_es(
        cited_record_2)
    expected_result_cited_record_2_citation_count = 1
    assert (expected_result_cited_record_2_citation_count ==
            result_cited_record_2["citation_count"])
def create_records_from_mirror_recids(self, recids):
    """Task which migrates records

    Args:
        recids: identifiers looked up with ``LegacyRecordsMirror.query.get``.

    Returns:
        list: stringified ids of the records migrated before the function
        returned.

    Raises:
        InvalidRequestError, OperationalError, StatementError,
        ThreadsTimeoutError: logged and re-raised. Any other exception is
        logged and swallowed.
    """
    models_committed.disconnect(index_after_commit)
    processed_records = set()
    try:
        for recid in recids:
            LOGGER.info("Migrate record from mirror", recid=recid)
            with db.session.begin_nested():
                mirror_entry = LegacyRecordsMirror.query.get(recid)
                record = migrate_record_from_mirror(mirror_entry)
            if not record:
                LOGGER.warning("Record is empty", recid=recid)
                continue
            processed_records.add(str(record.id))
        db.session.commit()
    except (InvalidRequestError, OperationalError, StatementError,
            ThreadsTimeoutError):
        # These errors are propagated to the caller after logging.
        LOGGER.exception(
            "Error during batch processing. Retrying.",
            processed_records=list(processed_records),
            recids=recids,
        )
        raise
    except Exception:
        # Anything else is logged and deliberately ignored.
        LOGGER.exception(
            "Got unexpected exception. Ignoring",
            processed_records=list(processed_records),
            recids=recids,
        )
    finally:
        models_committed.connect(index_after_commit)
    return list(processed_records)
def migrate_recids_from_mirror(prod_recids, skip_files=False):
    """Migrate the given mirror entries and bulk-index the resulting records.

    Each recid is looked up in ``LegacyRecordsMirror`` and migrated inside a
    nested transaction; records that are truthy and not marked ``deleted`` are
    queued and sent to Elasticsearch with ``es_bulk`` after the commit.

    Args:
        prod_recids: iterable of identifiers for ``LegacyRecordsMirror.query.get``.
        skip_files: forwarded to ``migrate_record_from_mirror``.
    """
    # Indexing is done in bulk below, so the per-commit receiver is disabled.
    models_committed.disconnect(index_after_commit)
    try:
        index_queue = []

        for recid in prod_recids:
            with db.session.begin_nested():
                record = migrate_record_from_mirror(
                    LegacyRecordsMirror.query.get(recid),
                    skip_files=skip_files,
                )
                if record and not record.get('deleted'):
                    index_queue.append(create_index_op(record))
        db.session.commit()

        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Reconnect the receiver even when migration or bulk indexing fails.
        models_committed.connect(index_after_commit)
def test_cli_reindex_deleted_and_redirected_records(inspire_app, cli):
    """Check that ``index reindex -p lit`` drops deleted and redirected records from ES.

    One record is redirected and one deleted while automatic indexing is off,
    so all three are still found in ES; after the CLI reindex only the
    surviving record is expected.
    """
    redirected = create_record("lit")
    new_record = create_record("lit")
    deleted = create_record("lit")

    # disable signals so re-indexing won't run automatically after record update
    models_committed.disconnect(index_after_commit)
    try:
        # redirect one record
        new_record_data = dict(new_record)
        new_record_data["deleted_records"] = [redirected["self"]]
        new_record.update(new_record_data)

        # delete one record
        deleted.delete()
    finally:
        # re-enable signals (also when the update/delete above fails)
        models_committed.connect(index_after_commit)

    # check if deleted and redirected were left in ES
    current_search.flush_and_refresh("*")

    expected_control_numbers = [
        redirected.control_number,
        new_record.control_number,
        deleted.control_number,
    ]
    results = LiteratureSearch().query_from_iq("").execute()
    control_numbers_from_es = [x.control_number for x in results.hits]
    assert set(control_numbers_from_es) == set(expected_control_numbers)

    cli.invoke(["index", "reindex", "-p", "lit"])
    current_search.flush_and_refresh("*")

    expected_control_numbers = [new_record.control_number]
    results = LiteratureSearch().query_from_iq("").execute()
    control_numbers_from_es = [x.control_number for x in results.hits]
    assert set(control_numbers_from_es) == set(expected_control_numbers)
def test_index_records_batch_fulltext_manually(inspire_app,
                                               clean_celery_session,
                                               override_config, s3):
    """Check the result of ``batch_index_literature_fulltext`` for two records.

    Three S3 files are created; two literature records with fulltext documents
    are committed and awaited in ES. One record then gets an extra document
    without automatic indexing, and the batch task is expected to report both
    uuids as successes with no failures.
    """
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"

    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        "this is my data",
        metadata,
    )
    create_s3_bucket(key_2)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_2),
        key_2,
        "this is my data",
        metadata,
    )
    create_s3_bucket(key_3)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(key_3),
        key_3,
        "this is my data",
        metadata,
    )

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        lit_record = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext": True,
                        "hidden": False,
                        "key": KEY,
                        "filename": "2105.15193.pdf",
                        "url": "https://arxiv.org/pdf/2105.15193.pdf",
                    }]
                },
            ))
        lit_record_2 = LiteratureRecord.create(
            faker.record(
                "lit",
                data={
                    "documents": [{
                        "fulltext": True,
                        "hidden": False,
                        "filename": "new_doc.pdf",
                        "key": key_2,
                        "url": "http://www.africau.edu/images/default/sample.pdf",
                    }]
                },
            ))
        db.session.commit()

        def assert_records_in_es():
            lit_record_from_es = LiteratureSearch.get_record_data_from_es(
                lit_record)
            lit_record_from_es_2 = LiteratureSearch.get_record_data_from_es(
                lit_record_2)
            assert lit_record_from_es and lit_record_from_es_2

        retry_until_pass(assert_records_in_es, retry_interval=5)

        models_committed.disconnect(index_after_commit)
        try:
            lit_record["documents"].append(
                {
                    "source": "arxiv",
                    "fulltext": True,
                    "filename": "another_doc.pdf",
                    "key": key_3,
                    "url": "http://www.africau.edu/images/default/sample.pdf",
                },
            )
            lit_record.update(dict(lit_record))
            db.session.commit()
        finally:
            # reconnect signal before we call batch_index_literature_fulltext
            # (and also when the update above fails)
            models_committed.connect(index_after_commit)

        task = batch_index_literature_fulltext.delay(
            [lit_record.id, lit_record_2.id])
        task.get(timeout=5)

        assert task.result == {
            "uuids": [str(lit_record.id), str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }
def migrator():
    """Command related to migrating INSPIRE data."""
    # Set up default logging before anything else runs.
    logging.basicConfig()

    # Disable auto-indexing receiver in migration tasks
    auto_index_receiver = receive_after_model_commit
    models_committed.disconnect(auto_index_receiver)
def test_process_references_in_records_process_self_citations(
        mock_batch_index, inspire_app, clean_celery_session,
        enable_self_citations):
    """Check that adding an author to a cited paper triggers batch indexing of the citing paper.

    ``lit_record_2`` cites ``lit_record`` and is authored by the second
    author. After the second author is appended to ``lit_record`` (without
    automatic indexing) and ``process_references_in_records`` runs for it, the
    first call to ``mock_batch_index`` must have received ``lit_record_2``'s id.
    """
    author_record = AuthorsRecord.create(
        faker.record(
            "aut",
            data={
                "name": {
                    "value": "'t Hooft, Gerardus",
                    "name_variants": ["'t Hooft, Gerard", "Hooft, Gerard T."],
                    "preferred_name": "Gerardus 't Hooft",
                },
                "ids": [
                    {"value": "INSPIRE-00060582", "schema": "INSPIRE ID"},
                    {"value": "G.tHooft.1", "schema": "INSPIRE BAI"},
                ],
            },
        ))
    author_record_2 = AuthorsRecord.create(
        faker.record(
            "aut",
            data={
                "name": {
                    "value": "'t Hooft, Gerardus Marcus",
                    "preferred_name": "Gerardus Marcus 't Hooft",
                },
                "ids": [
                    {"value": "INSPIRE-00060583", "schema": "INSPIRE ID"},
                    {"value": "G.tHooft.2", "schema": "INSPIRE BAI"},
                ],
            },
        ))
    lit_record = LiteratureRecord.create(
        faker.record(
            "lit",
            data={
                "authors": [{
                    "ids": [
                        {"value": "INSPIRE-00060582", "schema": "INSPIRE ID"},
                        {"value": "G.tHooft.1", "schema": "INSPIRE BAI"},
                    ],
                    "full_name": author_record["name"]["value"],
                    "record": author_record["self"],
                }]
            },
        ))
    lit_record_2 = LiteratureRecord.create(
        faker.record(
            "lit",
            literature_citations=[lit_record["control_number"]],
            data={
                "authors": [{
                    "ids": [
                        {"value": "INSPIRE-00060583", "schema": "INSPIRE ID"},
                        {"value": "G.tHooft.2", "schema": "INSPIRE BAI"},
                    ],
                    "full_name": author_record_2["name"]["value"],
                    "record": author_record_2["self"],
                }]
            },
        ))

    db.session.commit()

    def assert_records_in_es():
        lit_record_from_es = InspireSearch.get_record_data_from_es(lit_record)
        lit_record_from_es_2 = InspireSearch.get_record_data_from_es(
            lit_record_2)
        aut_record_from_es = InspireSearch.get_record_data_from_es(
            author_record)
        assert lit_record_from_es and aut_record_from_es and lit_record_from_es_2

    retry_until_pass(assert_records_in_es, retry_interval=5)

    models_committed.disconnect(index_after_commit)
    try:
        lit_record["authors"].append({
            "ids": [
                {"value": "INSPIRE-00060583", "schema": "INSPIRE ID"},
                {"value": "G.tHooft.2", "schema": "INSPIRE BAI"},
            ],
            "full_name": author_record_2["name"]["value"],
            "record": author_record_2["self"],
        })
        lit_record.update(dict(lit_record))
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the update above fails)
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([lit_record.id])
    task.get(timeout=5)

    assert sorted(mock_batch_index.mock_calls[0][1][0]) == sorted(
        [lit_record_2.id])
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Create records from raw MARC input and send them to Elasticsearch in bulk.

    Several receivers are disconnected for the duration of the chunk and
    reconnected in the ``finally`` clause. Unless ``dry_run`` is set, an
    ``InspireProdRecords`` row is merged for every raw record, recording
    whether it was valid and whether processing succeeded.

    Args:
        chunk: iterable of raw records parsed with ``marc_create_record``.
        broken_output: accepted but not used by this function.
        dry_run: forwarded to ``create_record``; when true, no bookkeeping
            rows are merged and nothing is sent to Elasticsearch.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            # Reset per record so the error log never shows stale JSON.
            record_json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])

            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record

            try:
                with db.session.begin_nested():
                    errors, recid, record_json = create_record(
                        recid,
                        record,
                        force=True,
                        dry_run=dry_run,
                        validation=True,
                    )
                    if dry_run:
                        continue

                    prod_record.valid = not errors
                    prod_record.errors = errors

                    index_name = (get_record_index(record_json) or
                                  cfg['SEARCH_ELASTIC_DEFAULT_INDEX'])
                    before_record_index.send(
                        recid, json=record_json, index=index_name)
                    record_json.update({
                        '_index': index_name,
                        '_type': 'record',
                        '_id': recid,
                        'citation_count': 0,
                    })
                    records_to_index.append(record_json)

                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as error:
                logger.error("ERROR with record {} and json {}".format(recid, record_json))
                logger.exception(error)
                if not dry_run:
                    # Record the failure in the bookkeeping table.
                    prod_record.successful = False
                    db.session.merge(prod_record)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
def test_process_references_in_records_reindexes_institutions_when_linked_institutions_change(
        inspire_app, celery_app_with_context, celery_session_worker):
    """Check that ``process_references_in_records`` updates the linked institution in ES.

    Three literature records reference the same institution through author
    affiliations, thesis info and record affiliations respectively; after the
    task runs, the institution document in ES is expected to report
    ``number_of_papers == 3``.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        institution_data = faker.record("ins", with_control_number=True)
        institution = InspireRecord.create(institution_data)
        institution_control_number = institution["control_number"]
        inst_ref = f"http://localhost:8000/api/institutions/{institution_control_number}"

        # Linked through an author's affiliation.
        data = faker.record("lit", with_control_number=True)
        data.update({
            "authors": [{
                "full_name": "John Doe",
                "affiliations": [{
                    "value": "Institution",
                    "record": {"$ref": inst_ref},
                }],
            }]
        })
        record_authors_aff = InspireRecord.create(data)
        db.session.commit()

        # Linked through thesis info.
        data = faker.record("lit", with_control_number=True)
        data.update({
            "thesis_info": {
                "institutions": [{"record": {"$ref": inst_ref}}]
            }
        })
        record_thesis_info = InspireRecord.create(data)
        db.session.commit()

        # Linked through record affiliations.
        data = faker.record("lit", with_control_number=True)
        data.update({
            "record_affiliations": [{
                "record": {"$ref": inst_ref},
                "value": "Institution",
            }]
        })
        record_affiliations = InspireRecord.create(data)
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        # (and also when the setup above fails)
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay(
        [record_authors_aff.id, record_thesis_info.id, record_affiliations.id])
    task.get(timeout=5)

    institution_record_es = InspireSearch.get_record_data_from_es(institution)
    expected_number_of_paper = 3

    assert expected_number_of_paper == institution_record_es[
        "number_of_papers"]