def test_prepare_alias_blacklist_dict():
    """Check the structure produced by prepare_alias_blacklist_dict.

    The expected data pins the following behavior (with the stemmer off):
    rows are grouped by their second element (language, possibly None);
    rows whose third element is False end up lower-cased and space-padded
    in the first list, rows whose third element is True keep their case
    and end up space-padded in the second list.
    """
    blacklist_rows = [
        ('Alias1', 'lang1', False),
        ('ABBREV1', 'lang1', True),
        ('Alias2', None, False),
        ('Alias3', 'lang1', False),
    ]
    expected_by_language = {
        'lang1': ([' alias1 ', ' alias3 '], [' ABBREV1 ']),
        None: ([' alias2 '], []),
    }

    prepared = prepare_alias_blacklist_dict(blacklist_rows, use_stemmer=False)
    assert_dict_equal(prepared, expected_by_language)

    # An empty blacklist must yield None rather than an empty dict.
    empty_result = prepare_alias_blacklist_dict([])
    assert_true(empty_result is None)
def test_alias_is_blacklisted():
    """Exercise alias_is_blacklisted with a prepared blacklist and with None."""
    blacklist_rows = [
        ('Alias1', 'lang1', False),
        ('ABBREV1', 'lang1', True),
        ('Alias2', None, False),
        ('Alias3', 'lang1', False),
    ]
    blacklist = prepare_alias_blacklist_dict(blacklist_rows, use_stemmer=False)

    # ' ABBREV1 ' was registered for 'lang1', so the lookup must hit.
    known_hit = alias_is_blacklisted(blacklist, ' ABBREV1 ', 'lang1', True)
    assert_true(known_hit)

    # ' AAA ' was never registered, so the lookup must miss.
    unknown_hit = alias_is_blacklisted(blacklist, ' AAA ', 'lang1', True)
    assert_false(unknown_hit)

    # Passing None instead of a prepared blacklist must report "not blacklisted".
    no_blacklist_hit = alias_is_blacklisted(None, 'aaaa', 'l', False)
    assert_false(no_blacklist_hit)
Ejemplo n.º 3
0
def test_geoentities_alias_filtering():
    """Run get_geoentities over the CSV test data using a custom alias blacklist."""
    blacklist_rows = [
        ('Afghanistan', None, False),
        ('Mississippi', 'en', False),
        ('AL', 'en', True),
    ]
    custom_blacklist = prepare_alias_blacklist_dict(blacklist_rows)

    def entity_names(actual):
        # Only the first element of each extracted item is used; it is
        # reduced to its entity name for comparison with the expected data.
        return [get_entity_name(found[0]) for found in actual]

    lexnlp_tests.test_extraction_func_on_test_data(
        get_geoentities,
        geo_config_list=_CONFIG,
        prepared_alias_black_list=custom_blacklist,
        actual_data_converter=entity_names,
        debug_print=True,
        start_from_csv_line=6,
    )
Ejemplo n.º 4
0
"""
from typing import List, Tuple, Union, Dict, Generator, Any

from lexnlp.config.en import geoentities_config
from lexnlp.extract.en.dict_entities import find_dict_entities, conflicts_take_first_by_id, \
    prepare_alias_blacklist_dict, conflicts_top_by_priority

# Module metadata (authorship, license, version).
__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2019, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.2.5"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

# Alias blacklist prepared once, at import time, from the configured
# geoentities_config.ALIAS_BLACK_LIST. It is used as the default value of the
# `prepared_alias_black_list` parameter of get_geoentities below.
_ALIAS_BLACK_LIST_PREPARED = prepare_alias_blacklist_dict(
    geoentities_config.ALIAS_BLACK_LIST)


def get_geoentities(
    text: str,
    geo_config_list: List[Tuple[int, str, List[Tuple[str, str, bool, int]]]],
    priority: bool = False,
    priority_by_id: bool = False,
    text_languages: List[str] = None,
    min_alias_len: int = geoentities_config.MIN_ALIAS_LEN,
    prepared_alias_black_list: Union[None, Dict[str, Tuple[
        List[str], List[str]]]] = _ALIAS_BLACK_LIST_PREPARED
) -> Generator[Tuple[Tuple, Tuple], Any, Any]:
    """
    Searches for geo entities from the provided config list and yields pairs of (entity, alias).
    Entity is: (entity_id, name, [list of aliases])