def test_plural_case_matching():
    """
    Test find_dict_entities with the stemmer enabled on plural word forms.

    The sample text mentions "tables", "men" and "masloboykas"; only the
    'Table' and 'masloboyka' entities are expected in the result.
    :return:
    """
    def to_name_and_alias(actual):
        # Reduce every result item to the (name, alias text) pair that is
        # compared against `expected`.
        # presumably each item is an (entity, alias) pair - verify against
        # find_dict_entities.
        return [(get_entity_name(c[0]), c[1][0]) for c in actual]

    table_cfg = entity_config(
        1,
        'Table',
        aliases=[entity_alias('tbl.', is_abbreviation=True)],
        name_is_alias=True)
    man_cfg = entity_config(2, 'man', name_is_alias=True)
    masloboyka_cfg = entity_config(3, 'masloboyka', name_is_alias=True)

    sample = (
        'We should detect the singular number of word "tables" here - the stemmer takes care of plural case. '
        'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. '
        'But it works for word "masloboykas" - a non existing word in English in plural case.'
    )

    # 'man' is deliberately absent: the text only contains "men".
    expected = (
        (table_cfg[1], 'Table'),
        (masloboyka_cfg[1], 'masloboyka'),
    )

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=[table_cfg, man_cfg, masloboyka_cfg],
        use_stemmer=True,
        actual_data_converter=to_name_and_alias,
        debug_print=True)
    def test_conflicts_equal_length_take_same_language(self):
        """
        Test find_dict_entities conflict handling with text_languages=['de'].

        'Some Entity One' is registered only as a French alias, so the single
        expected match is 'Some Entity'.
        :return:
        """
        def to_name_and_alias(actual):
            # Reduce every result item to the (name, alias text) pair that is
            # compared against `expected`.
            return [(get_entity_name(c.entity[0]), c.entity[1][0])
                    for c in actual]

        base = entity_config(1, 'Some Entity', aliases=['Something'])
        french_one = entity_config(
            2,
            'Some Entity1',
            aliases=[entity_alias('Some Entity One', language='fr')])
        two = entity_config(3, 'Some Entity2', aliases=['Something Two'])

        # NOTE: the two fragments are joined without a separating space
        # ("language.Shorter"); this is kept exactly as in the original data.
        sample = (
            '"Some Entity One" should not be found in this text because it is not in German language.'
            'Shorter match - "Someeee Entityyy" should be taken instead.'
        )

        expected = ((base[1], 'Some Entity'), )

        lexnlp_tests.test_extraction_func(
            expected,
            find_dict_entities,
            sample,
            all_possible_entities=[base, french_one, two],
            text_languages=['de'],
            actual_data_converter=to_name_and_alias,
            debug_print=True)
def test_companies_and():
    """
    Test get_companies on two companies joined by the conjunction "and".

    Both ('IBM', 'INC') and ('LexPredict', 'LLC') are expected.
    :return:
    """
    sentence = 'Those two organizations IBM INC and LexPredict LLC are cool.'
    expected_companies = {
        ('IBM', 'INC'),
        ('LexPredict', 'LLC'),
    }
    lexnlp_tests.test_extraction_func(expected_companies,
                                      get_companies,
                                      sentence)
def test_company_has_type_only():
    """
    Test get_companies when a company type appears without a company name.

    Only ('IBM', 'INC') is expected; the "company without name LLC" fragment
    must not produce a result.
    :return:
    """
    sentence = 'Those two organizations IBM INC and company without name LLC are cool.'
    expected_companies = {('IBM', 'INC')}
    lexnlp_tests.test_extraction_func(expected_companies,
                                      get_companies,
                                      sentence)
# Example #5
# 0
def test_company_upper_name():
    """
    Test get_companies with the name_upper argument enabled.

    The mixed-case name "Ibm" in the text is expected back as 'IBM'.
    :return:
    """
    sentence = 'This organization Ibm INC should be uppercased'
    expected_companies = {('IBM', 'INC')}
    lexnlp_tests.test_extraction_func(expected_companies,
                                      get_companies,
                                      sentence,
                                      name_upper=True)
def test_common_search_all_languages():
    """
    Test find_dict_entities when no text_languages argument is passed.

    The single configured entity 'Some Entity' is expected to be found.
    :return:
    """
    def to_name_and_alias(actual):
        # Reduce every result item to the (name, alias text) pair that is
        # compared against `expected`.
        return [(get_entity_name(c[0]), c[1][0]) for c in actual]

    entity = entity_config(1, 'Some Entity', aliases=['Something'])
    expected = ((entity[1], 'Some Entity'),)

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        'Some Entity should be found in this text.',
        all_possible_entities=[entity],
        actual_data_converter=to_name_and_alias,
        debug_print=True)
# Example #7
# 0
def test_company_detail_type():
    """
    Test get_companies with the detail_type option enabled.

    Each expected row carries five fields instead of the two-field
    (name, type) pairs used by the default-option tests.
    :return:
    """
    sentence = 'Those two organizations IBM INC and LexPredict LLC are cool.'
    expected_rows = [
        ('IBM', 'INC', 'CORP', 'Corporation', None),
        ('LexPredict', 'LLC', 'LLC', 'Company', None),
    ]
    lexnlp_tests.test_extraction_func(expected_rows,
                                      get_companies,
                                      sentence,
                                      detail_type=True)
# Example #8
# 0
def test_company_abbr_name():
    """
    Test get_companies with the parse_name_abbr option enabled.

    Each expected row has a third field: 'LP' for the company followed by
    "(LP)" in the text, None for the company with no parenthesized
    abbreviation.
    :return:
    """
    sentence = 'Those two organizations IBM INC and LexPredict LLC (LP) are cool.'
    expected_rows = [
        ('IBM', 'INC', None),
        ('LexPredict', 'LLC', 'LP'),
    ]
    lexnlp_tests.test_extraction_func(expected_rows,
                                      get_companies,
                                      sentence,
                                      parse_name_abbr=True)
def test_conflicts_take_longest_match():
    """
    Test find_dict_entities when entity names overlap.

    'Some Entity' is a prefix of 'Some Entity One'; only the longer
    'Some Entity One' is expected for the sample text.
    :return:
    """
    def to_name_and_alias(actual):
        # Reduce every result item to the (name, alias text) pair that is
        # compared against `expected`.
        return [(get_entity_name(c[0]), c[1][0]) for c in actual]

    short = entity_config(1, 'Some Entity', aliases=['Something'])
    one = entity_config(2, 'Some Entity One', aliases=['Something One'])
    two = entity_config(3, 'Some Entity Two', aliases=['Something Two'])

    sample = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'
    expected = ((one[1], 'Some Entity One'),)

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=[short, one, two],
        actual_data_converter=to_name_and_alias,
        debug_print=True)
    def test_equal_aliases_in_dif_languages(self):
        """
        Test find_dict_entities when two entities share the same alias text.

        Mississippi and Montserrat both declare the English abbreviation
        'MS'; with text_languages=['en'] both are expected for the "MS" in
        the text, while Canada (abbreviation 'CAN') is expected to be absent.
        :return:
        """
        def to_name_and_alias(actual):
            # Reduce every result item to the (name, alias text) pair that is
            # compared against `expected`.
            return [(get_entity_name(c.entity[0]), c.entity[1][0])
                    for c in actual]

        mississippi_aliases = [
            entity_alias('MS', is_abbreviation=True, language='en'),
            entity_alias('Mississippi', language='de'),
            entity_alias('Mississippi', language='en'),
        ]
        mississippi = entity_config(1, 'Mississippi',
                                    aliases=mississippi_aliases)

        montserrat_aliases = [
            entity_alias('MS', is_abbreviation=True, language='en'),
            entity_alias('Montserrat', language='de'),
            entity_alias('Montserrat', language='en'),
        ]
        montserrat = entity_config(2, 'Montserrat',
                                   aliases=montserrat_aliases)

        canada_aliases = [
            entity_alias('CAN', is_abbreviation=True, language='en'),
            entity_alias('Kanada', language='de'),
            entity_alias('Canada', language='en'),
        ]
        canada = entity_config(3, 'Canada', aliases=canada_aliases)

        sample = (
            '"MS" here can mean either "MMMississippi" or "MMMontserrat" because they have equal aliases in English. '
            'This test is here because in one version of the code alias texts were required to be unique. '
            '"CCCanada" (can) should not be detected because word "can" is in lowercase here.'
        )

        expected = ((mississippi[1], 'MS'), (montserrat[1], 'MS'))

        lexnlp_tests.test_extraction_func(
            expected,
            find_dict_entities,
            sample,
            all_possible_entities=[mississippi, montserrat, canada],
            text_languages=['en'],
            actual_data_converter=to_name_and_alias,
            debug_print=True)
def test_abbreviations_simple():
    """
    Test find_dict_entities on abbreviation aliases.

    The upper-case "IT" in "IT's" is expected to match the 'ITAbbrev'
    entity; the lower-case word "is" must not match the 'IS' abbreviation.
    :return:
    """
    def to_name_and_alias(actual):
        # Reduce every result item to the (name, alias text) pair that is
        # compared against `expected`.
        return [(get_entity_name(c[0]), c[1][0]) for c in actual]

    it_entity = entity_config(
        1, 'ITAbbrev', aliases=[entity_alias('IT', is_abbreviation=True)])
    is_entity = entity_config(
        2, 'ISAbbrev', aliases=[entity_alias('IS', is_abbreviation=True)])

    sample = (
        '"IT\'s" entity should be detected even with "\'s" because tokenizer takes care of this kind of things. '
        '"ISS" entity should not be detected - bacause "is" word'
        ' is in lowercase here and probably does not mean an abbreviation.'
    )

    expected = ((it_entity[1], 'IT'),)

    # NOTE(review): 'ge' is kept exactly as in the original call; the aliases
    # above declare no language - confirm whether 'de' was intended.
    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=[it_entity, is_entity],
        text_languages=['ge'],
        actual_data_converter=to_name_and_alias,
        debug_print=True)
def test_get_citations():
    """
    Test default get_citations behavior against the shared test data.

    Every expected row is normalized (volume, page and year become int, or
    None when empty) and then checked twice: with return_source=False
    against the rows minus their last field, and with return_source=True
    against the full rows.
    :return:
    """
    def int_or_none(value):
        # Empty / falsy cells stand for "no value".
        return int(value) if value else None

    def normalize(row):
        # Rows must have exactly eight fields; unpacking enforces that.
        (volume, reporter, reporter_full_name, page,
         page2, court, year, source_text) = row
        return (int_or_none(volume),
                reporter,
                reporter_full_name,
                int_or_none(page),
                page2,
                court,
                int_or_none(year),
                source_text)

    for _i, text, _input_args, raw_rows in lexnlp_tests.iter_test_data_text_and_tuple():
        with_sources = [normalize(row) for row in raw_rows]
        # The source text is the last field of each row.
        without_sources = [row[:-1] for row in with_sources]

        lexnlp_tests.test_extraction_func(without_sources, get_citations, text, return_source=False)
        lexnlp_tests.test_extraction_func(with_sources, get_citations, text, return_source=True)
def test_am_pm_abbreviations():
    """
    Test find_dict_entities on 'AM'/'PM' abbreviation aliases next to times.

    Three texts are checked in order: "11:00 AM or 11:00 PM" (nothing
    expected), "11:00am now in (AM)" (only the 'AM' alias expected) and
    "11:00am now." (nothing expected).
    :return:
    """
    def to_name_and_alias(actual):
        # Reduce every result item to the (name, alias text) pair that is
        # compared against the expected list.
        return [(get_entity_name(c[0]), c[1][0]) for c in actual]

    am = entity_config(1,
                       'America',
                       aliases=[entity_alias('AM', is_abbreviation=True)],
                       name_is_alias=False)
    pm = entity_config(2,
                       'Postmodernism',
                       aliases=[entity_alias('PM', is_abbreviation=True)],
                       name_is_alias=False)
    entities = [am, pm]

    # (expected result, input text) pairs, run in this order.
    cases = (
        ([], 'It is 11:00 AM or 11:00 PM now.'),
        ([(am[1], 'AM')], 'It is 11:00am now in (AM). Hello!'),
        ([], 'It is 11:00am now.'),
    )

    for expected, sample in cases:
        lexnlp_tests.test_extraction_func(
            expected,
            find_dict_entities,
            sample,
            all_possible_entities=entities,
            actual_data_converter=to_name_and_alias,
            debug_print=True)