Esempio n. 1
0
def fuzzy_match(obj, eng):
    """Tell whether the system holds a record similar to the workflow payload.

    The check is skipped (returning ``False``) unless the
    ``FEATURE_FLAG_ENABLE_FUZZY_MATCHER`` application setting is truthy.
    Otherwise ``match`` is run on ``obj.data`` with the ``FUZZY_MATCH``
    application setting as its configuration, and the results are passed
    through ``dedupe_list``.

    As a side effect, ``extra_data['matches']['fuzzy']`` is set to the
    briefs (built by ``_get_hep_record_brief``) of at most the first 5
    matched records.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` if at least one similar record matched, ``False``
        otherwise.

    """
    app_config = current_app.config
    if not app_config.get('FEATURE_FLAG_ENABLE_FUZZY_MATCHER'):
        return False

    matcher_config = app_config['FUZZY_MATCH']
    hits = dedupe_list(match(obj.data, matcher_config))

    briefs = []
    for hit in hits:
        briefs.append(_get_hep_record_brief(hit['_source']))

    # Only the first five briefs are stored, but any match counts as a hit.
    stored_matches = obj.extra_data.setdefault('matches', {})
    stored_matches['fuzzy'] = briefs[:5]
    return len(briefs) > 0
Esempio n. 2
0
def test_match_raises_if_inner_hits_param_has_wrong_config():
    """Expect ``ValueError`` ('Malformed query') for the ``inner_hits`` below.

    The nested query's ``inner_hits`` mapping only carries the key
    ``not_existing_argument``.
    """
    nested_query = {
        "paths": ["first_name", "last_name"],
        "search_paths": ["authors.first_name", "authors.last_name"],
        "type": "nested",
        "inner_hits": {"not_existing_argument": ["authors.record"]},
    }
    single_step = {"queries": [nested_query]}
    config = {
        'algorithm': [single_step],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))

    error_message = str(excinfo.value)
    assert 'Malformed query' in error_message
Esempio n. 3
0
def article_exists(obj, eng):
    """Tell whether the workflow payload duplicates a record in the system.

    Runs ``match`` on ``obj.data`` without an explicit configuration and
    passes the results through ``dedupe_list``.

    As a side effect, ``extra_data['record_matches']`` is always overwritten
    with the list of matched control numbers (empty when nothing matched).

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` if at least one record matched, ``False`` otherwise.

    """
    control_numbers = []
    for hit in dedupe_list(match(obj.data)):
        control_numbers.append(hit['_source']['control_number'])

    obj.extra_data['record_matches'] = control_numbers
    return bool(control_numbers)
Esempio n. 4
0
def fuzzy_match(obj, eng):
    """Report whether a record resembling ``obj.data`` is already stored.

    When the ``FEATURE_FLAG_ENABLE_FUZZY_MATCHER`` application setting is
    falsy or missing, nothing is matched and ``False`` is returned right
    away. Otherwise ``match`` is called with the ``FUZZY_MATCH`` application
    setting, its output goes through ``dedupe_list``, and each result's
    ``_source`` is turned into a brief by ``_get_hep_record_brief``.

    Side effect: ``extra_data['matches']['fuzzy']`` receives at most the
    first 5 briefs.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` when any brief was produced, ``False`` otherwise.

    """
    feature_enabled = current_app.config.get(
        'FEATURE_FLAG_ENABLE_FUZZY_MATCHER')
    if not feature_enabled:
        return False

    settings = current_app.config['FUZZY_MATCH']
    deduped_hits = dedupe_list(match(obj.data, settings))
    all_briefs = list(
        _get_hep_record_brief(hit['_source']) for hit in deduped_hits
    )

    top_briefs = all_briefs[0:5]
    obj.extra_data.setdefault('matches', {})['fuzzy'] = top_briefs
    return True if all_briefs else False
Esempio n. 5
0
def match_literature_author_with_config(author_data, matcher_config):
    """Return the matches whose first author inner hit carries a record ref.

    Arguments:
        author_data: the payload handed to ``match``.
        matcher_config: the configuration handed to ``match``.

    Returns:
        list: the results of ``match`` for which ``get_value`` finds a truthy
        value at ``inner_hits.authors.hits.hits[0]._source.record.$ref``.

    """
    record_ref_path = "inner_hits.authors.hits.hits[0]._source.record.$ref"

    records_with_ref = []
    for candidate in match(author_data, matcher_config):
        if get_value(candidate, record_ref_path):
            records_with_ref.append(candidate)
    return records_with_ref
Esempio n. 6
0
def article_exists(obj, eng):
    """Report whether ``obj.data`` matches a record already in the system.

    ``match`` is called with ``obj.data`` only (no explicit configuration)
    and its output is passed through ``dedupe_list``.

    Side effect: ``extra_data['record_matches']`` is set to the matched
    control numbers, or to an empty list when there are none.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` when at least one control number was collected,
        ``False`` otherwise.

    """
    deduped_hits = dedupe_list(match(obj.data))
    found = [hit['_source']['control_number'] for hit in deduped_hits]

    if not found:
        obj.extra_data['record_matches'] = []
        return False

    obj.extra_data['record_matches'] = found
    return True
Esempio n. 7
0
def test_match_raises_if_the_configuration_does_not_have_all_the_keys():
    """Expect ``KeyError`` ('Malformed configuration') without 'algorithm'."""
    config = dict([
        ('doc_type', 'hep'),
        ('index', 'records-hep'),
    ])

    with pytest.raises(KeyError) as excinfo:
        list(match(None, config))

    error_message = str(excinfo.value)
    assert 'Malformed configuration' in error_message
Esempio n. 8
0
def match_author(author):
    """Match ``author`` using the ``AUTHOR_MATCHER_EXACT_CONFIG`` setting.

    Builds a mapping from each matched record's ``self.$ref`` to its ``ids``
    list (empty when the record has none) and hands it to
    ``get_reference_and_bai_if_unambiguous_match``.

    Arguments:
        author: the payload handed to ``match``.

    Returns:
        whatever ``get_reference_and_bai_if_unambiguous_match`` returns for
        the collected mapping.

    """
    exact_config = current_app.config["AUTHOR_MATCHER_EXACT_CONFIG"]

    ids_by_ref = {}
    for hit in match(author, exact_config):
        source = hit["_source"]
        ref = source["self"]["$ref"]
        # A later hit with the same ref replaces the earlier entry.
        ids_by_ref[ref] = source.get("ids", [])

    return get_reference_and_bai_if_unambiguous_match(ids_by_ref)
Esempio n. 9
0
def test_match_raises_if_one_step_of_the_algorithm_has_no_queries():
    """Expect ``KeyError`` ('Malformed algorithm') for a step lacking 'queries'."""
    step_without_queries = {
        'validator': 'inspire_matcher.validators:default_validator',
    }
    config = {
        'algorithm': [step_without_queries],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(KeyError) as excinfo:
        list(match(None, config))

    error_message = str(excinfo.value)
    assert 'Malformed algorithm' in error_message
Esempio n. 10
0
def pending_in_holding_pen(obj, eng):
    """Tell whether another Holding Pen entry matches the workflow payload.

    Runs ``match`` against the ``holdingpen-hep`` index with two exact
    queries (arXiv eprint value and DOI value), passes the results through
    ``dedupe_list`` and discards the hit whose id equals ``obj.id``.

    As a side effect, ``extra_data['holdingpen_matches']`` is set to the
    remaining ids when there is at least one; it is left untouched
    otherwise.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` if at least one other Holding Pen entry matched,
        ``False`` otherwise.

    """
    exact_queries = [
        {
            'path': 'arxiv_eprints.value',
            'search_path': 'metadata.arxiv_eprints.value.raw',
            'type': 'exact',
        },
        {
            'path': 'dois.value',
            'search_path': 'metadata.dois.value.raw',
            'type': 'exact',
        },
    ]
    config = {
        'algorithm': [{'queries': exact_queries}],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }

    other_ids = []
    for hit in dedupe_list(match(obj.data, config)):
        hit_id = int(hit['_id'])
        # The current workflow object is not a duplicate of itself.
        if hit_id != obj.id:
            other_ids.append(hit_id)

    if not other_ids:
        return False

    obj.extra_data['holdingpen_matches'] = other_ids
    return True
Esempio n. 11
0
def duplicated_validator(property_name, property_value):
    """Raise ``ValidationError`` if a record with the same identifier exists.

    Builds a one-field payload from ``property_value`` and runs ``match``
    against the ``records-hep`` index with two exact queries (arXiv id and
    DOI). Matches whose ``_source.deleted`` is truthy are rejected by the
    step validator.

    Arguments:
        property_name: either ``'arXiv ID'`` or ``'DOI'``; selects the
            payload field and is interpolated in the error message.
        property_value: the identifier value to look for.

    Raises:
        ValueError: if ``property_name`` is neither ``'arXiv ID'`` nor
            ``'DOI'``. The original code hit an unbound local in that case.
        ValidationError: if at least one non-deleted record matched; the
            message links to the first matched control number.

    """
    # Resolve the payload first so an unsupported name fails fast, before
    # any query is built or sent.
    if property_name == 'arXiv ID':
        data = {
            'arxiv_id': property_value,
        }
    elif property_name == 'DOI':
        data = {
            'doi': property_value,
        }
    else:
        raise ValueError(
            'Unsupported property name %r: expected "arXiv ID" or "DOI".'
            % (property_name,))

    def _is_not_deleted(base_record, match_result):
        # Validator hook: accept only matches not flagged as deleted.
        return not get_value(match_result, '_source.deleted', default=False)

    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_id',
                        'search_path': 'arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'doi',
                        'search_path': 'dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': _is_not_deleted,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    matches = dedupe_list(match(data, config))
    matched_ids = [int(el['_source']['control_number']) for el in matches]
    if matched_ids:
        url = url_for(
            'invenio_records_ui.literature',
            pid_value=matched_ids[0],
        )
        raise ValidationError(
            'There exists already an item with the same %s. '
            '<a target="_blank" href="%s">See the record.</a>' %
            (property_name, url))
Esempio n. 12
0
def test_match_raises_if_one_query_does_not_have_a_type():
    """Expect ``ValueError`` ('Malformed query') for an empty query dict."""
    empty_query = {}
    config = {
        'algorithm': [{'queries': [empty_query]}],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))

    error_message = str(excinfo.value)
    assert 'Malformed query' in error_message
Esempio n. 13
0
def test_match_raises_if_one_query_type_is_not_supported():
    """Expect ``ValueError`` naming the query and step for an unknown type."""
    expected_message = 'Malformed query. Query 0 of step 0 does not compile: type "not-supported" is not supported.'
    unsupported_query = {'type': 'not-supported'}
    config = {
        'algorithm': [{'queries': [unsupported_query]}],
        'doc_type': 'records',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))

    assert expected_message in str(excinfo.value)
Esempio n. 14
0
def duplicated_validator(property_name, property_value):
    """Raise ``ValidationError`` when the identifier is already in the system.

    A single-field payload is built from ``property_value`` and matched
    against the ``records-hep`` index through two exact queries (arXiv id
    and DOI). The step validator drops matches whose ``_source.deleted`` is
    truthy.

    Arguments:
        property_name: either ``'arXiv ID'`` or ``'DOI'``; selects the
            payload field and is interpolated in the error message.
        property_value: the identifier value to look for.

    Raises:
        ValueError: if ``property_name`` is neither ``'arXiv ID'`` nor
            ``'DOI'``. The original code hit an unbound local in that case.
        ValidationError: if at least one non-deleted record matched; the
            message links to the first matched control number.

    """
    # Validate the property name before doing anything else so that an
    # unsupported name produces a clear error instead of an unbound local.
    if property_name == 'arXiv ID':
        data = {
            'arxiv_id': property_value,
        }
    elif property_name == 'DOI':
        data = {
            'doi': property_value,
        }
    else:
        raise ValueError(
            'Unsupported property name %r: expected "arXiv ID" or "DOI".'
            % (property_name,)
        )

    def _is_not_deleted(base_record, match_result):
        # Validator hook: keep a match unless it is flagged as deleted.
        return not get_value(match_result, '_source.deleted', default=False)

    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_id',
                        'search_path': 'arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'doi',
                        'search_path': 'dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': _is_not_deleted,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    matches = dedupe_list(match(data, config))
    matched_ids = [int(el['_source']['control_number']) for el in matches]
    if matched_ids:
        url = url_for(
            'invenio_records_ui.literature',
            pid_value=matched_ids[0],
        )
        raise ValidationError(
            'There exists already an item with the same %s. '
            '<a target="_blank" href="%s">See the record.</a>'
            % (property_name, url)
        )
Esempio n. 15
0
def update_references_pointing_to_merged_record(refs_to_schema,
                                                merged_record_uri,
                                                new_record_uri):
    """Update records whose references point at ``merged_record_uri``.

    For every ``(index, path)`` pair, a matcher configuration is obtained
    from ``get_config_for_given_path`` and used to find records matching
    ``{"$ref": merged_record_uri}``. Each matched record is loaded
    (deleted ones included), the values found at ``path`` minus its trailing
    ``.$ref`` are flattened and passed one by one to
    ``update_reference_if_reference_uri_matches``, and the record is then
    updated with its own content. The database session is committed once,
    after all pairs have been processed.

    Arguments:
        refs_to_schema: iterable of ``(index, path)`` pairs; each ``path`` is
            sliced as a string ending in ``.$ref``.
        merged_record_uri: the URI searched for in the references.
        new_record_uri: the URI passed on as the replacement.

    """
    ref_suffix_length = len(".$ref")

    for index, path in refs_to_schema:
        config = get_config_for_given_path(index, path)
        # Drop the trailing ".$ref" to address the object holding the ref.
        container_path = path[:-ref_suffix_length]

        for hit in match({"$ref": merged_record_uri}, config):
            record = InspireRecord.get_record(hit["_id"], with_deleted=True)
            containers = flatten_list(get_value(record, container_path, []))
            for container in containers:
                update_reference_if_reference_uri_matches(
                    container, merged_record_uri, new_record_uri)

            record.update(dict(record))
            LOGGER.info("Updated reference for record", uuid=str(record.id))

    db.session.commit()
Esempio n. 16
0
def test_validator_list(es_mock):
    """Check that every validator in a list is called with the result."""
    es_mock.search.return_value = {'hits': {'hits': {'dummy result'}}}

    first_validator = mock.Mock(return_value=True)
    second_validator = mock.Mock(return_value=True)

    exact_query = {
        'type': 'exact',
        'path': 'dummy.path',
        'search_path': 'dummy.search.path',
    }
    config = {
        'algorithm': [
            {
                'queries': [exact_query],
                'validator': [first_validator, second_validator],
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }
    record = {'dummy': {'path': 'Non empty value'}}

    matched = list(match(record, config))

    assert 'dummy result' in matched
    for validator in (first_validator, second_validator):
        validator.assert_called_with(record, 'dummy result')
Esempio n. 17
0
def test_match_raises_if_an_exact_query_does_not_have_all_the_keys():
    """Expect ``ValueError`` ('Malformed query') for an exact query lacking 'path'."""
    query_without_path = {
        'search_path': 'arxiv_eprints.value.raw',
        'type': 'exact',
    }
    config = {
        'algorithm': [{'queries': [query_without_path]}],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))

    error_message = str(excinfo.value)
    assert 'Malformed query' in error_message
Esempio n. 18
0
def _pending_in_holding_pen(obj, validation_func):
    """List the ids of other Holding Pen entries matching ``obj``.

    Runs ``match`` against the ``holdingpen-hep`` index with two exact
    queries (arXiv eprint value and DOI value) and ``validation_func`` as
    the step validator, then passes the results through ``dedupe_list``.

    Args:
        obj: a workflow object.
        validation_func: the validator placed in the matcher configuration.

    Returns:
        (list): the integer ids of the matches, excluding ``obj.id``.

    """
    exact_queries = [
        {
            'path': 'arxiv_eprints.value',
            'search_path': 'metadata.arxiv_eprints.value.raw',
            'type': 'exact',
        },
        {
            'path': 'dois.value',
            'search_path': 'metadata.dois.value.raw',
            'type': 'exact',
        },
    ]
    single_step = {
        'queries': exact_queries,
        'validator': validation_func,
    }
    config = {
        'algorithm': [single_step],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }

    other_ids = []
    for hit in dedupe_list(match(obj.data, config)):
        hit_id = int(hit['_id'])
        # Skip the workflow object itself.
        if hit_id != obj.id:
            other_ids.append(hit_id)
    return other_ids
Esempio n. 19
0
def test_match_raises_on_invalid_collections():
    """Expect ``ValueError`` ('Malformed collections') for a string value."""
    exact_query = {
        'search_path': 'arxiv_eprints.value.raw',
        'path': 'arxiv_eprints.value',
        'type': 'exact',
    }
    config = {
        'algorithm': [{'queries': [exact_query]}],
        'doc_type': 'hep',
        'index': 'records-hep',
        # A plain string is passed here instead of a collection of names.
        'collections': 'Literature',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))

    error_message = str(excinfo.value)
    assert 'Malformed collections' in error_message
Esempio n. 20
0
def test_match_uses_the_given_validator_callable(es_mock):
    """Check that a callable validator returning ``False`` drops the result."""
    es_mock.search.return_value = {'hits': {'hits': {'dummy result'}}}
    rejecting_validator = mock.Mock(return_value=False)

    exact_query = {
        'type': 'exact',
        'path': 'dummy.path',
        'search_path': 'dummy.search.path',
    }
    config = {
        'algorithm': [
            {
                'queries': [exact_query],
                'validator': rejecting_validator,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }
    record = {'dummy': {'path': 'Non empty value'}}

    matched = list(match(record, config))

    assert not matched
    rejecting_validator.assert_called_with(record, 'dummy result')
Esempio n. 21
0
def _pending_in_holding_pen(obj, validation_func):
    """Collect the ids of Holding Pen entries that match ``obj``.

    ``match`` is run on ``obj.data`` against the ``holdingpen-hep`` index
    using exact queries on the arXiv eprint value and the DOI value, with
    ``validation_func`` as the step validator. The output is passed through
    ``dedupe_list`` before the ids are extracted.

    Args:
        obj: a workflow object.
        validation_func: the validator placed in the matcher configuration.

    Returns:
        (list): the integer ids of the matches, excluding ``obj.id``.

    """
    def _exact_query(path):
        # Both queries search the raw subfield under ``metadata``.
        return {
            'path': path,
            'search_path': 'metadata.%s.raw' % path,
            'type': 'exact',
        }

    config = {
        'algorithm': [
            {
                'queries': [
                    _exact_query('arxiv_eprints.value'),
                    _exact_query('dois.value'),
                ],
                'validator': validation_func,
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }

    matched_ids = (int(hit['_id']) for hit in dedupe_list(match(obj.data, config)))
    return [hit_id for hit_id in matched_ids if hit_id != obj.id]
Esempio n. 22
0
def exact_match(obj, eng):
    """Tell whether the workflow payload exactly matches a stored record.

    Runs ``match`` on ``obj.data`` with the ``EXACT_MATCH`` application
    setting as its configuration and passes the results through
    ``dedupe_list``.

    As a side effect, ``extra_data['matches']['exact']`` is set to the list
    of matched control numbers (empty when nothing matched).

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` if at least one record matched, ``False`` otherwise.

    """
    matcher_config = current_app.config['EXACT_MATCH']

    control_numbers = []
    for hit in dedupe_list(match(obj.data, matcher_config)):
        control_numbers.append(hit['_source']['control_number'])

    stored_matches = obj.extra_data.setdefault('matches', {})
    stored_matches['exact'] = control_numbers
    return len(control_numbers) > 0
Esempio n. 23
0
def exact_match(obj, eng):
    """Report whether ``obj.data`` has an exact match in the system.

    ``match`` is called with the ``EXACT_MATCH`` application setting and its
    output is passed through ``dedupe_list``; the control number of every
    remaining result is collected.

    Side effect: the collected control numbers are stored under
    ``extra_data['matches']['exact']``, creating ``matches`` if needed.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        bool: ``True`` when any control number was collected, ``False``
        otherwise.

    """
    settings = current_app.config['EXACT_MATCH']
    deduped_hits = dedupe_list(match(obj.data, settings))
    found = list(hit['_source']['control_number'] for hit in deduped_hits)

    obj.extra_data.setdefault('matches', {})['exact'] = found
    return True if found else False