Exemple #1
0
def test_cv_search_cached(inspire_app):
    """Check that the CV response still shows the title the record was created with.

    The record is updated while ``index_after_commit`` is disconnected from
    ``models_committed``; the HTML response is then expected to render the
    original title rather than the modified one.
    """
    headers = {"Accept": "text/vnd+inspire.html+html"}
    data = {
        "control_number": 637275232,
        "titles": [{"title": "Yet another title"}],
    }
    record = create_record("lit", data=data)

    models_committed.disconnect(index_after_commit)
    try:
        data = dict(record)
        data["titles"] = [{"title": "Modified title"}]

        record.update(data)

        expected_status_code = 200
        expected_result = '<!DOCTYPE html><html><body>  <p><b>    <a href="https://localhost:5000/literature/637275232">      Yet another title    </a>  </b></p>          <br></body></html>'
        with inspire_app.test_client() as client:
            response = client.get("/literature", headers=headers)

        response_status_code = response.status_code
        response_data = response.get_data(as_text=True).replace("\n", "")
        assert expected_status_code == response_status_code
        assert expected_result == response_data
    finally:
        # Reconnect even when an assertion fails, so a failure here does not
        # leave the receiver disconnected for whatever runs afterwards.
        models_committed.connect(index_after_commit)
def test_index_record_manually(app, celery_app_with_context,
                               celery_session_worker, retry_until_matched):
    """Commit a literature record without indexing, then index it by hand.

    The commit happens while ``index_after_commit`` is disconnected, so the
    ``records-hep`` index is expected to be empty until ``index()`` is called
    explicitly on the record.
    """
    index_name = "records-hep"
    record = LiteratureRecord.create(faker.record("lit"))

    # Commit with the receiver detached so the commit itself indexes nothing.
    models_committed.disconnect(index_after_commit)
    db.session.commit()
    models_committed.connect(index_after_commit)

    es.indices.refresh(index_name)
    assert es.search(index_name)["hits"]["total"] == 0

    record.index()

    refresh_step = {
        "step": es.indices.refresh,
        "args": [index_name],
    }
    search_step = {
        "step": es.search,
        "args": [index_name],
        "expected_result": {
            "expected_key": "hits.total",
            "expected_result": 1,
        },
    }
    retry_until_matched([refresh_step, search_step])
def test_process_references_in_records_reindexes_experiments_when_linked_experiments_change(
        app, celery_app_with_context, celery_session_worker):
    """Check ``number_of_papers`` on an experiment after processing a paper that links to it.

    Both records are created while ``index_after_commit`` is disconnected;
    the receiver is reconnected before ``process_references_in_records`` runs.
    """
    # Keep the records out of the index while they are being created.
    models_committed.disconnect(index_after_commit)

    experiment = InspireRecord.create(
        faker.record("exp", with_control_number=True))
    db.session.commit()

    control_number = experiment["control_number"]
    experiment_ref = f"http://localhost:8000/api/experiments/{control_number}"

    literature_data = faker.record("lit", with_control_number=True)
    literature_data["accelerator_experiments"] = [
        {"legacy_name": "LIGO", "record": {"$ref": experiment_ref}},
    ]

    literature = InspireRecord.create(literature_data)
    db.session.commit()

    models_committed.connect(index_after_commit)

    process_references_in_records.delay([literature.id]).get(timeout=5)

    indexed_experiment = InspireSearch.get_record_data_from_es(experiment)

    assert 1 == indexed_experiment["number_of_papers"]
Exemple #4
0
def migrate_chunk(chunk, skip_files=False):
    """Migrate a chunk of raw records and bulk-index the resulting records.

    ``index_after_commit`` is disconnected from ``models_committed`` for the
    duration of the call; indexing is done in one ``es_bulk`` request instead.

    Args:
        chunk: iterable of raw records passed to ``migrate_and_insert_record``.
        skip_files: forwarded to ``migrate_and_insert_record``.
    """
    models_committed.disconnect(index_after_commit)
    try:
        index_queue = []

        try:
            for raw_record in chunk:
                # One savepoint per record.
                with db.session.begin_nested():
                    record = migrate_and_insert_record(
                        raw_record,
                        skip_files=skip_files,
                    )
                    if record:
                        index_queue.append(create_index_op(record))
            db.session.commit()
        finally:
            db.session.close()

        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Always restore the receiver, even when migration, the commit or the
        # bulk request raises; otherwise it would stay disconnected.
        models_committed.connect(index_after_commit)
Exemple #5
0
def migrate_chunk(chunk):
    """Migrate a chunk of raw records and bulk-index the resulting records.

    ``receive_after_model_commit`` is disconnected and the collection signals
    are unregistered for the duration of the call; both are restored before
    returning, including when an error is raised.

    Args:
        chunk: iterable of raw records passed to ``migrate_and_insert_record``.
    """
    models_committed.disconnect(receive_after_model_commit)
    current_collections.unregister_signals()
    try:
        index_queue = []

        try:
            for raw_record in chunk:
                # One savepoint per record.
                with db.session.begin_nested():
                    record = migrate_and_insert_record(raw_record)
                    if record:
                        index_queue.append(create_index_op(record))
            db.session.commit()
        finally:
            db.session.close()

        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Restore both hooks on every exit path so a failed chunk does not
        # leave them switched off.
        models_committed.connect(receive_after_model_commit)
        current_collections.register_signals()
def test_process_references_in_records_reindexes_conferences_when_pub_info_changes(
        inspire_app, celery_app_with_context, celery_session_worker):
    """Check ``number_of_contributions`` on a conference after processing a paper that links to it.

    The conference and the conference paper are created while
    ``index_after_commit`` is disconnected; the receiver is reconnected before
    ``process_references_in_records`` runs.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        conference_data = faker.record("con", with_control_number=True)
        conference_record = InspireRecord.create(conference_data)
        conference_control_number = conference_record["control_number"]
        conf_ref = f"http://localhost:8000/api/conferences/{conference_control_number}"
        data = faker.record("lit", with_control_number=True)
        data["publication_info"] = [{"conference_record": {"$ref": conf_ref}}]
        data["document_type"] = ["conference paper"]
        record = InspireRecord.create(data)
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records;
        # done in ``finally`` so a setup failure cannot leave it disconnected
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([record.id])
    task.get(timeout=5)

    conference_record_es = InspireSearch.get_record_data_from_es(
        conference_record)
    expected_number_of_contributions = 1

    assert (expected_number_of_contributions ==
            conference_record_es["number_of_contributions"])
Exemple #7
0
def test_model_signals(db, Todo):
    """Check that ``models_committed`` reports insert, update and delete operations.

    A local receiver collects the ``changes`` lists it is sent; after each
    commit exactly one ``(instance, operation)`` entry is expected.
    """
    recorded = []

    def committed(sender, changes):
        assert isinstance(changes, list)
        recorded.extend(changes)

    models_committed.connect(committed)
    try:
        todo = Todo("Awesome", "the text")
        db.session.add(todo)
        # Nothing is reported before the commit.
        assert len(recorded) == 0
        db.session.commit()
        assert len(recorded) == 1
        assert recorded[0][0] == todo
        assert recorded[0][1] == "insert"
        del recorded[:]
        todo.text = "aha"
        db.session.commit()
        assert len(recorded) == 1
        assert recorded[0][0] == todo
        assert recorded[0][1] == "update"
        del recorded[:]
        db.session.delete(todo)
        db.session.commit()
        assert len(recorded) == 1
        assert recorded[0][0] == todo
        assert recorded[0][1] == "delete"
    finally:
        # Detach the receiver even when an assertion fails, so it does not
        # keep receiving commits triggered elsewhere.
        models_committed.disconnect(committed)
def test_index_record(inspire_app, celery_app_with_context,
                      celery_session_worker):
    """Index one record of each listed type through the ``index_records`` task.

    The task is expected to return the string form of every UUID it was given,
    and each record must then be retrievable from the search index.
    """
    models_committed.disconnect(index_after_commit)
    try:
        records = [
            create_record_async("lit"),
            create_record_async("aut"),
            create_record_async("job"),
            create_record_async("jou"),
            create_record_async("exp"),
            create_record_async("con"),
            create_record_async("dat"),
            create_record_async("ins"),
        ]

        uuids = [record.id for record in records]
        task = index_records.delay(uuids)

        results = task.get(timeout=5)

        uuids = [str(uuid) for uuid in uuids]
        assert results == uuids

        for record in records:
            result = InspireSearch.get_record_data_from_es(record)
            assert record["control_number"] == result["control_number"]
    finally:
        # Reconnect on every exit path; a failed assertion or task timeout
        # must not leave the receiver disconnected.
        models_committed.connect(index_after_commit)
Exemple #9
0
def create_records_from_mirror_recids(recids):
    """Migrate the given mirror records with ``index_after_commit`` disconnected.

    A record whose migration raises is logged and skipped; the remaining
    records are still processed.

    Args:
        recids: identifiers looked up with ``LegacyRecordsMirror.query.get``.

    Returns:
        list: string UUIDs of the successfully migrated records, without
        duplicates and in no particular order.
    """
    models_committed.disconnect(index_after_commit)
    processed_records = set()
    try:
        for recid in recids:
            try:
                LOGGER.info("Migrate record from mirror", recid=recid)
                with db.session.begin_nested():
                    record = migrate_record_from_mirror(
                        LegacyRecordsMirror.query.get(recid))
            except Exception:
                # Best effort: one broken record must not abort the batch.
                LOGGER.exception("Cannot migrate record", recid=recid)
                continue

            if record:
                processed_records.add(str(record.id))
            else:
                LOGGER.warning("Record is empty", recid=recid)
        db.session.commit()
    finally:
        # Restore the receiver even if the final commit raises.
        models_committed.connect(index_after_commit)

    return list(processed_records)
Exemple #10
0
def test_index_record_manually(inspire_app, clean_celery_session):
    """Commit a literature record without indexing, then index it by hand.

    The hit count is expected to go from 0 to 1 only after the explicit
    ``index()`` call.
    """
    record = LiteratureRecord.create(faker.record("lit"))

    # Commit with the receiver detached so the commit itself indexes nothing.
    models_committed.disconnect(index_after_commit)
    db.session.commit()
    models_committed.connect(index_after_commit)

    assert_es_hits_count(0)
    record.index()
    assert_es_hits_count(1)
def test_index_record_manually(inspire_app, clean_celery_session):
    """Commit a literature record without indexing, then index it by hand.

    The record is expected to be absent from the index until ``index()`` is
    called explicitly.
    """
    record = LiteratureRecord.create(faker.record("lit"))

    # Commit with the receiver detached so the commit itself indexes nothing.
    models_committed.disconnect(index_after_commit)
    db.session.commit()
    models_committed.connect(index_after_commit)

    control_number = record["control_number"]
    assert_record_not_in_es(control_number)

    record.index()

    assert_record_in_es(record["control_number"])
Exemple #12
0
def signalling(app, changes, **kwargs):
    """Rename the first user after a commit that touched the ``User`` table.

    Acts once per call, on the first change whose instance maps to the
    ``User`` table. The receiver disconnects itself while it commits, so its
    own commit does not trigger it again.

    Args:
        app: the signal sender (unused).
        changes: iterable of ``(instance, operation)`` pairs.
        **kwargs: ignored.
    """
    for instance, _operation in changes:
        if instance.__tablename__ != User.__tablename__:
            continue

        models_committed.disconnect(signalling)
        try:
            session = db.create_scoped_session()
            try:
                user = session.query(User).first()
                if user and user.username == 'signalling_test':
                    user.username = '******'
                    session.merge(user)
                    session.commit()
            finally:
                # Release the scoped session even when the query or commit fails.
                session.remove()
        finally:
            # Re-arm the receiver on every exit path.
            models_committed.connect(signalling)
        break
def test_process_references_in_records_with_different_type_of_records_doesnt_throw_an_exception(
        inspire_app, celery_app_with_context, celery_session_worker):
    """Run ``process_references_in_records`` over a mix of record types.

    The task is expected to return every UUID it was given (as strings), and
    each cited literature record must end up with a ``citation_count`` of 1.
    """
    # Keep the records out of the index while they are being created.
    models_committed.disconnect(index_after_commit)

    cited_record_1 = LiteratureRecord.create(faker.record("lit"))
    cited_record_2 = LiteratureRecord.create(faker.record("lit"))

    citing_record_1 = LiteratureRecord.create(
        faker.record(
            "lit", literature_citations=[cited_record_1["control_number"]]))
    citing_record_2 = LiteratureRecord.create(
        faker.record(
            "lit", literature_citations=[cited_record_2["control_number"]]))

    db.session.commit()

    other_types = ("aut", "job", "jou", "exp", "con", "dat", "ins")
    other_records = [
        create_record_async(record_type) for record_type in other_types
    ]

    # The receiver must be back in place before the task runs.
    models_committed.connect(index_after_commit)

    uuids = [other.id for other in other_records]
    uuids += [citing_record_1.id, citing_record_2.id]

    results = process_references_in_records.delay(uuids).get(timeout=5)

    assert results == [str(uuid) for uuid in uuids]

    indexed_cited_1 = InspireSearch.get_record_data_from_es(cited_record_1)
    assert 1 == indexed_cited_1["citation_count"]

    indexed_cited_2 = InspireSearch.get_record_data_from_es(cited_record_2)
    assert 1 == indexed_cited_2["citation_count"]
def test_gracefully_handle_records_updating_in_wrong_order(
        inspire_app, clean_celery_session):
    """Index record versions out of order and check the resulting index state.

    Indexing the latest version and then the one before it is expected to
    keep the newest title in the index while the cited record's
    ``citation_count`` drops to 0.
    """
    # We want to run indexing in weird order, so disable auto indexing
    models_committed.disconnect(index_after_commit)
    try:
        cited_record = LiteratureRecord.create(data=faker.record("lit"))
        record_data = faker.record(
            "lit", literature_citations=[cited_record.control_number])
        record = LiteratureRecord.create(data=record_data)
        db.session.commit()

        record = LiteratureRecord.get_record_by_pid_value(
            record.control_number)

        index_record(record.id, record.model.versions[-1].version_id)
        assert LiteratureSearch().get_source(
            cited_record.id)["citation_count"] == 1

        # First update: drop the references.
        data = dict(record)
        del data["references"]

        record.update(data)
        db.session.commit()
        record = LiteratureRecord.get_record_by_pid_value(
            record.control_number)
        # Second update: change the title.
        data = dict(record)
        data["titles"][0] = {"title": "New Title"}
        record.update(data)
        db.session.commit()

        record = LiteratureRecord.get_record_by_pid_value(
            record.control_number)

        index_record(record.id, record.model.versions[-1].version_id)

        record = LiteratureRecord.get_record_by_pid_value(
            record.control_number)

        assert LiteratureSearch().get_source(
            cited_record.id)["citation_count"] == 1
        assert LiteratureSearch().get_source(record.id)["titles"] == [{
            "title":
            "New Title"
        }]

        # Now index the second-to-last version, i.e. out of order.
        index_record(record.id, record.model.versions[-2].version_id)

        assert LiteratureSearch().get_source(
            cited_record.id)["citation_count"] == 0
        assert LiteratureSearch().get_source(record.id)["titles"] == [{
            "title":
            "New Title"
        }]
    finally:
        # Re-enable auto indexing on every exit path, including a failed
        # assertion part-way through.
        models_committed.connect(index_after_commit)
def test_index_record_fulltext_manually(inspire_app, clean_celery_session,
                                        override_config, s3, datadir):
    """Index a record's fulltext by hand and check the indexed document.

    A PDF is uploaded under ``KEY``, a literature record referencing it is
    committed without indexing, and ``index_fulltext()`` is then called
    explicitly. The indexed document is expected to have an ``attachment``
    field and no ``text`` field.
    """
    pdf_path = os.path.join(datadir, "2206.04407.pdf")
    create_s3_bucket(KEY)
    create_s3_file(
        current_s3_instance.get_bucket_for_file_key(KEY),
        KEY,
        pdf_path,
        {"foo": "bar"},
        ContentType="application/pdf",
    )

    fulltext_document = {
        "source": "arxiv",
        "fulltext": True,
        "filename": "new_doc.pdf",
        "key": KEY,
        "url": "http://www.africau.edu/images/default/sample.pdf",
    }

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        data = faker.record("lit")
        data.update(documents=[fulltext_document])
        rec = LiteratureRecord.create(data)

        # Commit with the receiver detached so the commit itself indexes nothing.
        models_committed.disconnect(index_after_commit)
        db.session.commit()
        models_committed.connect(index_after_commit)

        assert_record_not_in_es(rec["control_number"])

        rec.index_fulltext()

        def _fulltext_document_indexed():
            current_search.flush_and_refresh("*")
            search = LiteratureSearch().get_record(str(rec.id))
            first_hit = search.execute().hits.hits[0]
            indexed_document = first_hit._source["documents"][0]
            assert "attachment" in indexed_document
            # pipeline should remove it
            assert "text" not in indexed_document

        retry_until_pass(_fulltext_document_indexed,
                         timeout=90,
                         retry_interval=5)
def test_process_references_in_records_process_author_records(
        mock_batch_index, inspire_app, clean_celery_session):
    """Check which records are batch-indexed after an author record changes.

    Two literature records reference the same author. After the author's name
    is updated without auto-indexing, ``process_references_in_records`` is
    expected to pass both literature UUIDs (as strings) to the mocked batch
    index call.
    """
    author_record = AuthorsRecord.create(faker.record("aut"))

    def _lit_data_for_author():
        # A fresh dict per call, so the two records never share one object.
        return {
            "authors": [{
                "full_name": author_record["name"]["value"],
                "record": author_record["self"],
            }]
        }

    lit_record = LiteratureRecord.create(
        faker.record("lit", data=_lit_data_for_author()))
    lit_record_2 = LiteratureRecord.create(
        faker.record("lit", data=_lit_data_for_author()))

    db.session.commit()

    def _all_records_indexed():
        fetched = [
            InspireSearch.get_record_data_from_es(indexed)
            for indexed in (lit_record, lit_record_2, author_record)
        ]
        assert all(fetched)

    retry_until_pass(_all_records_indexed, retry_interval=5)

    # Update the author while the receiver is detached.
    models_committed.disconnect(index_after_commit)
    author_record["name"]["value"] = "Another Name"
    author_record.update(dict(author_record))
    db.session.commit()
    # The receiver must be back in place before the task runs.
    models_committed.connect(index_after_commit)

    process_references_in_records.delay([author_record.id]).get(timeout=5)

    indexed_uuids = mock_batch_index.mock_calls[0][1][0]
    expected_uuids = [str(lit_record.id), str(lit_record_2.id)]
    assert sorted(indexed_uuids) == sorted(expected_uuids)
Exemple #17
0
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Create records from a chunk and send them to Elasticsearch in bulk.

    The record-modification and citation receivers are disconnected while the
    chunk is processed and reconnected in ``finally``. A record that fails is
    logged and skipped, and the rest of the chunk is still processed.

    Args:
        chunk: iterable of raw records passed to ``create_record``.
        broken_output: optional path; every failing raw record is appended to
            this file.
        dry_run: forwarded to ``create_record``.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            # Named ``record_json`` so the local does not shadow the stdlib
            # ``json`` module name.
            recid = record_json = None
            try:
                recid, record_json = create_record(record,
                                                   force=True, dry_run=dry_run)
                index = get_record_index(record_json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=record_json, index=index)
                record_json.update({'_index': index, '_type': 'record', '_id': recid, 'citation_count': 0})
                records_to_index.append(record_json)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, record_json))
                logger.exception(err)
                if broken_output:
                    # Close the file after each write; the handle used to be
                    # leaked once per failing record.
                    with open(broken_output, "a") as broken_output_fd:
                        print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
def test_process_references_in_records_process_conference_records(
        mock_batch_index, inspire_app, clean_celery_session):
    """Check which records are batch-indexed after a conference record changes.

    Two conference papers reference the same conference. After the
    conference's title is updated without auto-indexing,
    ``process_references_in_records`` is expected to pass both literature
    UUIDs to the mocked batch index call.
    """
    conference_data = faker.record(
        "con", data={"titles": [{"title": "Test conference"}]})
    conf_record = ConferencesRecord.create(conference_data)

    conference_ref = {"$ref": conf_record["self"]["$ref"]}
    lit_data = {
        "publication_info": [{"conference_record": conference_ref}],
        "document_type": ["conference paper"],
    }
    lit_record = LiteratureRecord.create(faker.record("lit", data=lit_data))
    lit_record_2 = LiteratureRecord.create(faker.record("lit", data=lit_data))

    db.session.commit()

    def _all_records_indexed():
        lit_from_es = InspireSearch.get_record_data_from_es(lit_record)
        lit_2_from_es = InspireSearch.get_record_data_from_es(lit_record_2)
        conf_from_es = InspireSearch.get_record_data_from_es(conf_record)
        assert lit_from_es and conf_from_es and lit_2_from_es

    retry_until_pass(_all_records_indexed, retry_interval=5)

    # Update the conference while the receiver is detached.
    models_committed.disconnect(index_after_commit)
    conf_record["titles"] = [{"title": "Southern California Strings Seminar "}]
    conf_record.update(dict(conf_record))
    db.session.commit()
    # The receiver must be back in place before the task runs.
    models_committed.connect(index_after_commit)

    process_references_in_records.delay([conf_record.id]).get(timeout=5)

    indexed_uuids = mock_batch_index.mock_calls[0][1][0]
    expected_uuids = [lit_record.id, lit_record_2.id]
    assert sorted(indexed_uuids) == sorted(expected_uuids)
def test_process_references_in_records(inspire_app, celery_app_with_context,
                                       celery_session_worker):
    """Check ``citation_count`` on cited records after processing the citing ones.

    Two cited and two citing literature records are created while
    ``index_after_commit`` is disconnected; after
    ``process_references_in_records`` runs on the citing records, each cited
    record is expected to have a ``citation_count`` of 1.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        cited_record_1 = LiteratureRecord.create(faker.record("lit"))
        cited_record_2 = LiteratureRecord.create(faker.record("lit"))

        data_citing_record_1 = faker.record(
            "lit", literature_citations=[cited_record_1["control_number"]])
        citing_record_1 = LiteratureRecord.create(data_citing_record_1)
        data_citing_record_2 = faker.record(
            "lit", literature_citations=[cited_record_2["control_number"]])
        citing_record_2 = LiteratureRecord.create(data_citing_record_2)

        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records;
        # done in ``finally`` so a setup failure cannot leave it disconnected
        models_committed.connect(index_after_commit)

    uuids = [citing_record_1.id, citing_record_2.id]

    task = process_references_in_records.delay(uuids)
    task.get(timeout=5)

    result_cited_record_1 = InspireSearch.get_record_data_from_es(
        cited_record_1)
    expected_result_cited_record_1_citation_count = 1

    assert (expected_result_cited_record_1_citation_count ==
            result_cited_record_1["citation_count"])

    result_cited_record_2 = InspireSearch.get_record_data_from_es(
        cited_record_2)
    expected_result_cited_record_2_citation_count = 1
    assert (expected_result_cited_record_2_citation_count ==
            result_cited_record_2["citation_count"])
Exemple #20
0
def create_records_from_mirror_recids(self, recids):
    """Migrate the given mirror records with ``index_after_commit`` disconnected.

    The listed database/timeout errors are logged and re-raised; any other
    exception is logged and swallowed, and the UUIDs collected so far are
    returned.

    Args:
        recids: identifiers looked up with ``LegacyRecordsMirror.query.get``.

    Returns:
        list: string UUIDs of the records migrated before returning, without
        duplicates and in no particular order.
    """
    models_committed.disconnect(index_after_commit)
    migrated_uuids = set()
    try:
        for recid in recids:
            LOGGER.info("Migrate record from mirror", recid=recid)
            with db.session.begin_nested():
                mirror_entry = LegacyRecordsMirror.query.get(recid)
                record = migrate_record_from_mirror(mirror_entry)
            if not record:
                LOGGER.warning("Record is empty", recid=recid)
                continue
            migrated_uuids.add(str(record.id))
        db.session.commit()
    except (InvalidRequestError, OperationalError, StatementError,
            ThreadsTimeoutError):
        # Logged as retryable, then propagated to the caller.
        LOGGER.exception(
            "Error during batch processing. Retrying.",
            processed_records=list(migrated_uuids),
            recids=recids,
        )
        raise
    except Exception:
        # Anything else is logged and deliberately not propagated.
        LOGGER.exception(
            "Got unexpected exception. Ignoring",
            processed_records=list(migrated_uuids),
            recids=recids,
        )
    finally:
        models_committed.connect(index_after_commit)

    return list(migrated_uuids)
Exemple #21
0
def migrate_recids_from_mirror(prod_recids, skip_files=False):
    """Migrate mirror records by id and bulk-index the non-deleted results.

    ``index_after_commit`` is disconnected for the duration of the call and
    restored before returning, including when an error is raised.

    Args:
        prod_recids: identifiers looked up with ``LegacyRecordsMirror.query.get``.
        skip_files: forwarded to ``migrate_record_from_mirror``.
    """
    models_committed.disconnect(index_after_commit)
    try:
        index_queue = []

        for recid in prod_recids:
            # One savepoint per record.
            with db.session.begin_nested():
                record = migrate_record_from_mirror(
                    LegacyRecordsMirror.query.get(recid),
                    skip_files=skip_files,
                )
                # Records flagged as deleted are migrated but not indexed.
                if record and not record.get('deleted'):
                    index_queue.append(create_index_op(record))
        db.session.commit()
        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Always restore the receiver; otherwise a failed migration would
        # leave it disconnected.
        models_committed.connect(index_after_commit)
Exemple #22
0
def test_cli_reindex_deleted_and_redirected_records(inspire_app, cli):
    """Check that ``index reindex -p lit`` drops deleted and redirected records.

    One record is redirected and one deleted while ``index_after_commit`` is
    disconnected, so all three records are still expected in the index.
    After the CLI reindex only the surviving record is expected.
    """
    def _control_numbers_in_es():
        # Refresh first so the query sees the current index state.
        current_search.flush_and_refresh("*")
        results = LiteratureSearch().query_from_iq("").execute()
        return {hit.control_number for hit in results.hits}

    redirected = create_record("lit")
    new_record = create_record("lit")
    deleted = create_record("lit")

    # disable signals so re-indexing won't run automatically after record update
    models_committed.disconnect(index_after_commit)
    try:
        # redirect one record
        new_record_data = dict(new_record)
        new_record_data["deleted_records"] = [redirected["self"]]
        new_record.update(new_record_data)

        # delete one record
        deleted.delete()
    finally:
        # re-enable signals, also when the update or delete above fails
        models_committed.connect(index_after_commit)

    # check if deleted and redirected were left in ES
    expected_control_numbers = {
        redirected.control_number,
        new_record.control_number,
        deleted.control_number,
    }
    assert _control_numbers_in_es() == expected_control_numbers

    cli.invoke(["index", "reindex", "-p", "lit"])

    assert _control_numbers_in_es() == {new_record.control_number}
Exemple #23
0
def migrate_recids_from_mirror(prod_recids, skip_files=False):
    """Migrate mirror records by id and bulk-index the non-deleted results.

    ``index_after_commit`` is disconnected for the duration of the call and
    restored before returning, including when an error is raised.

    Args:
        prod_recids: identifiers looked up with ``LegacyRecordsMirror.query.get``.
        skip_files: forwarded to ``migrate_record_from_mirror``.
    """
    models_committed.disconnect(index_after_commit)
    try:
        index_queue = []

        for recid in prod_recids:
            # One savepoint per record.
            with db.session.begin_nested():
                record = migrate_record_from_mirror(
                    LegacyRecordsMirror.query.get(recid),
                    skip_files=skip_files,
                )
                # Records flagged as deleted are migrated but not indexed.
                if record and not record.get('deleted'):
                    index_queue.append(create_index_op(record))
        db.session.commit()

        req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
        es_bulk(
            es,
            index_queue,
            stats_only=True,
            request_timeout=req_timeout,
        )
    finally:
        # Always restore the receiver; otherwise a failed migration would
        # leave it disconnected.
        models_committed.connect(index_after_commit)
def test_index_records_batch_fulltext_manually(inspire_app,
                                               clean_celery_session,
                                               override_config, s3):
    """Run ``batch_index_literature_fulltext`` on two records with documents.

    Three files are uploaded, two literature records with fulltext documents
    are created and indexed, a third document is appended to the first record
    without auto-indexing, and the batch task is expected to report two
    successes and no failures.
    """
    metadata = {"foo": "bar"}
    key_2 = "9bfe422f251eeaa7ec2a4dd5aebebc8a"
    key_3 = "e5892c4e59898346d307332354c6c7b8"

    # Same upload for each of the three keys, in the original order.
    for file_key in (KEY, key_2, key_3):
        create_s3_bucket(file_key)
        create_s3_file(
            current_s3_instance.get_bucket_for_file_key(file_key),
            file_key,
            "this is my data",
            metadata,
        )

    first_document = {
        "fulltext": True,
        "hidden": False,
        "key": KEY,
        "filename": "2105.15193.pdf",
        "url": "https://arxiv.org/pdf/2105.15193.pdf",
    }
    second_document = {
        "fulltext": True,
        "hidden": False,
        "filename": "new_doc.pdf",
        "key": key_2,
        "url": "http://www.africau.edu/images/default/sample.pdf",
    }
    appended_document = {
        "source": "arxiv",
        "fulltext": True,
        "filename": "another_doc.pdf",
        "key": key_3,
        "url": "http://www.africau.edu/images/default/sample.pdf",
    }

    with override_config(FEATURE_FLAG_ENABLE_FULLTEXT=True,
                         FEATURE_FLAG_ENABLE_FILES=False):
        lit_record = LiteratureRecord.create(
            faker.record("lit", data={"documents": [first_document]}))
        lit_record_2 = LiteratureRecord.create(
            faker.record("lit", data={"documents": [second_document]}))
        db.session.commit()

        def _both_records_indexed():
            first_from_es = LiteratureSearch.get_record_data_from_es(
                lit_record)
            second_from_es = LiteratureSearch.get_record_data_from_es(
                lit_record_2)
            assert first_from_es and second_from_es

        retry_until_pass(_both_records_indexed, retry_interval=5)

        # Add the third document while the receiver is detached.
        models_committed.disconnect(index_after_commit)
        lit_record["documents"].append(appended_document)
        lit_record.update(dict(lit_record))
        db.session.commit()
        # The receiver must be back in place before the batch task runs.
        models_committed.connect(index_after_commit)

        record_uuids = [lit_record.id, lit_record_2.id]
        task = batch_index_literature_fulltext.delay(record_uuids)
        task.get(timeout=5)

        expected_result = {
            "uuids": [str(lit_record.id), str(lit_record_2.id)],
            "success_count": 2,
            "failures_count": 0,
            "failures": [],
        }
        assert task.result == expected_result
Exemple #25
0
def migrator():
    """Command related to migrating INSPIRE data.

    Configures basic logging and disconnects ``receive_after_model_commit``
    from ``models_committed``. Nothing in this function reconnects it.
    """
    logging.basicConfig()
    # Disable auto-indexing receiver in migration tasks
    # NOTE(review): presumably this is a CLI group callback whose decorator is
    # outside this excerpt; confirm against the full file.
    models_committed.disconnect(receive_after_model_commit)
def test_process_references_in_records_process_self_citations(
        mock_batch_index, inspire_app, clean_celery_session,
        enable_self_citations):
    """Check which records are batch-indexed after an author is added.

    Two author records and two literature records are created;
    ``lit_record_2`` is built with
    ``literature_citations=[lit_record["control_number"]]`` (presumably
    making it cite ``lit_record`` -- confirm against ``faker.record``).
    The second author is then appended to ``lit_record`` while automatic
    indexing is switched off, ``process_references_in_records`` is run for
    ``lit_record``, and the first call recorded on ``mock_batch_index`` is
    expected to have received exactly ``lit_record_2.id``.
    """

    def build_ids(inspire_id, bai):
        # Return a fresh list on every call so no two payloads share
        # mutable objects.
        return [
            {"value": inspire_id, "schema": "INSPIRE ID"},
            {"value": bai, "schema": "INSPIRE BAI"},
        ]

    def build_author_entry(author, inspire_id, bai):
        # Literature-side "authors" item pointing at an author record.
        return {
            "ids": build_ids(inspire_id, bai),
            "full_name": author["name"]["value"],
            "record": author["self"],
        }

    author_record = AuthorsRecord.create(
        faker.record(
            "aut",
            data={
                "name": {
                    "value": "'t Hooft, Gerardus",
                    "name_variants": ["'t Hooft, Gerard", "Hooft, Gerard T."],
                    "preferred_name": "Gerardus 't Hooft",
                },
                "ids": build_ids("INSPIRE-00060582", "G.tHooft.1"),
            },
        ))
    author_record_2 = AuthorsRecord.create(
        faker.record(
            "aut",
            data={
                "name": {
                    "value": "'t Hooft, Gerardus Marcus",
                    "preferred_name": "Gerardus Marcus 't Hooft",
                },
                "ids": build_ids("INSPIRE-00060583", "G.tHooft.2"),
            },
        ))
    lit_record = LiteratureRecord.create(
        faker.record(
            "lit",
            data={
                "authors": [
                    build_author_entry(
                        author_record, "INSPIRE-00060582", "G.tHooft.1"),
                ]
            },
        ))
    lit_record_2 = LiteratureRecord.create(
        faker.record(
            "lit",
            literature_citations=[lit_record["control_number"]],
            data={
                "authors": [
                    build_author_entry(
                        author_record_2, "INSPIRE-00060583", "G.tHooft.2"),
                ]
            },
        ))
    db.session.commit()

    def assert_records_in_es():
        # Fetch all three documents first (same order as before), then
        # require every one of them to be truthy.
        fetched = [
            InspireSearch.get_record_data_from_es(stored)
            for stored in (lit_record, lit_record_2, author_record)
        ]
        assert all(fetched)

    retry_until_pass(assert_records_in_es, retry_interval=5)

    # Switch automatic indexing off while the record is modified. The
    # try/finally guarantees the receiver is restored even if the update or
    # commit raises, so a failure here cannot leave the global signal
    # disconnected for the tests that run afterwards.
    models_committed.disconnect(index_after_commit)
    try:
        lit_record["authors"].append(
            build_author_entry(
                author_record_2, "INSPIRE-00060583", "G.tHooft.2"))
        lit_record.update(dict(lit_record))
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([lit_record.id])
    task.get(timeout=5)

    # First positional argument of the first recorded call on the mock.
    indexed_uuids = mock_batch_index.mock_calls[0][1][0]
    assert sorted(indexed_uuids) == sorted([lit_record_2.id])
Exemple #27
0
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Convert one chunk of raw records, store them and bulk-index them.

    Each raw record is parsed with ``marc_create_record`` and handed to
    ``create_record`` inside its own nested transaction, so a failure on
    one record is logged and does not abort the rest of the chunk.
    Several signal receivers are disconnected while the chunk is processed
    and reconnected in a ``finally`` clause, which also closes the session.

    :param chunk: iterable of raw records; each item is passed to
        ``marc_create_record`` and stored as ``marcxml``.
    :param broken_output: not used by this function.
    :param dry_run: when true, ``create_record`` is still called (with
        ``dry_run=True``) but no ``InspireProdRecords`` row is written and
        nothing is sent to Elasticsearch.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    # (signal, receiver) pairs silenced for the duration of the chunk; they
    # are disconnected here and reconnected below in this same order.
    silenced = (
        (models_committed, record_modification),
        (after_record_insert, catch_citations_insert),
        (before_record_index, add_citation_count_on_insert_or_update),
        (before_record_index, catch_citations_update),
    )
    for signal, receiver in silenced:
        signal.disconnect(receiver)

    pending_docs = []
    try:
        for raw_record in chunk:
            # Reset per record so the error log never shows a stale value.
            record_json = None
            marc_record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(marc_record['001'])
            if not dry_run:
                prod_row = InspireProdRecords(recid=recid)
                prod_row.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, record_json = create_record(
                        recid, marc_record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if not dry_run:
                        prod_row.valid = not errors
                        prod_row.errors = errors
                        index = get_record_index(record_json)
                        if not index:
                            index = cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                        before_record_index.send(
                            recid, json=record_json, index=index)
                        # Bulk-action metadata travels inside the document.
                        record_json.update({
                            '_index': index,
                            '_type': 'record',
                            '_id': recid,
                            'citation_count': 0,
                        })
                        pending_docs.append(record_json)
                        prod_row.successful = True
                        db.session.merge(prod_row)
            except Exception as exc:
                logger.error("ERROR with record {} and json {}".format(recid, record_json))
                logger.exception(exc)
                if not dry_run:
                    prod_row.successful = False
                    db.session.merge(prod_row)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, pending_docs, request_timeout=60)
    finally:
        for signal, receiver in silenced:
            signal.connect(receiver)
        db.session.close()
def test_process_references_in_records_reindexes_institutions_when_linked_institutions_change(
        inspire_app, celery_app_with_context, celery_session_worker):
    """Check the institution's paper count after processing linked records.

    An institution and three literature records referencing it (through
    ``authors[].affiliations``, ``thesis_info.institutions`` and
    ``record_affiliations`` respectively) are created while automatic
    indexing is switched off. ``process_references_in_records`` is then run
    for the three literature records, and the institution document read
    back from Elasticsearch is expected to report ``number_of_papers == 3``.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        institution_data = faker.record("ins", with_control_number=True)
        institution = InspireRecord.create(institution_data)

        institution_control_number = institution["control_number"]
        inst_ref = f"http://localhost:8000/api/institutions/{institution_control_number}"

        def create_literature(extra_fields):
            # Build a literature payload, overlay the given fields, store
            # it and commit -- one commit per record.
            payload = faker.record("lit", with_control_number=True)
            payload.update(extra_fields)
            created = InspireRecord.create(payload)
            db.session.commit()
            return created

        record_authors_aff = create_literature({
            "authors": [{
                "full_name": "John Doe",
                "affiliations": [{
                    "value": "Institution",
                    "record": {"$ref": inst_ref},
                }],
            }]
        })

        record_thesis_info = create_literature({
            "thesis_info": {
                "institutions": [{
                    "record": {"$ref": inst_ref},
                }]
            }
        })

        record_affiliations = create_literature({
            "record_affiliations": [{
                "record": {"$ref": inst_ref},
                "value": "Institution",
            }]
        })
    finally:
        # reconnect signal before we call process_references_in_records.
        # Doing it in ``finally`` ensures a failure during setup cannot
        # leave the global signal disconnected for later tests.
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay(
        [record_authors_aff.id, record_thesis_info.id, record_affiliations.id])

    task.get(timeout=5)

    institution_record_es = InspireSearch.get_record_data_from_es(institution)
    expected_number_of_paper = 3

    assert expected_number_of_paper == institution_record_es[
        "number_of_papers"]