def test_plural_case_matching():
    """
    Check find_dict_entities with use_stemmer=True on plural word forms.

    The text contains "tables", "men" and "masloboykas"; the expected result
    lists only the 'Table' and 'masloboyka' entities, i.e. the irregular
    plural "men" is not expected to be matched to 'man'.
    """
    def _name_and_alias(found):
        # Reduce every hit to (entity name, matched alias text).
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in found]

    table_entity = entity_config(
        1, 'Table',
        aliases=[entity_alias('tbl.', is_abbreviation=True)],
        name_is_alias=True)
    man_entity = entity_config(2, 'man', name_is_alias=True)
    masloboyka_entity = entity_config(3, 'masloboyka', name_is_alias=True)

    sample_text = (
        'We should detect the singular number of word "tables" here - the stemmer takes care of plural case. '
        'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. '
        'But it works for word "masloboykas" - a non existing word in English in plural case.'
    )

    expected_hits = (
        (table_entity[1], 'Table'),
        (masloboyka_entity[1], 'masloboyka'),
    )

    lexnlp_tests.test_extraction_func(
        expected_hits,
        find_dict_entities,
        sample_text,
        all_possible_entities=[table_entity, man_entity, masloboyka_entity],
        use_stemmer=True,
        actual_data_converter=_name_and_alias,
        debug_print=True)
def test_conflicts_equal_length_take_same_language(self):
    """
    Check that an alias declared for another language is not picked.

    'Some Entity One' is registered only as a language='fr' alias while the
    search runs with text_languages=['de'], so the expected result contains
    just the shorter 'Some Entity' match.
    """
    def _name_and_alias(found):
        # Each hit exposes its (entity, alias) pair through the .entity attribute.
        return [(get_entity_name(hit.entity[0]), hit.entity[1][0]) for hit in found]

    base_entity = entity_config(1, 'Some Entity', aliases=['Something'])
    french_alias_entity = entity_config(
        2, 'Some Entity1',
        aliases=[entity_alias('Some Entity One', language='fr')])
    other_entity = entity_config(3, 'Some Entity2', aliases=['Something Two'])

    sample_text = (
        '"Some Entity One" should not be found in this text because it is not in German language.'
        'Shorter match - "Someeee Entityyy" should be taken instead.'
    )

    expected_hits = ((base_entity[1], 'Some Entity'), )

    lexnlp_tests.test_extraction_func(
        expected_hits,
        find_dict_entities,
        sample_text,
        all_possible_entities=[base_entity, french_alias_entity, other_entity],
        text_languages=['de'],
        actual_data_converter=_name_and_alias,
        debug_print=True)
def test_companies_and():
    """
    Test get_companies on two company names joined by "and".

    Both 'IBM INC' and 'LexPredict LLC' are expected to be extracted.
    """
    sentence = 'Those two organizations IBM INC and LexPredict LLC are cool.'
    expected_companies = {('IBM', 'INC'), ('LexPredict', 'LLC')}
    lexnlp_tests.test_extraction_func(expected_companies, get_companies, sentence)
def test_company_has_type_only():
    """
    Test get_companies when a company type appears without a company name.

    'company without name LLC' is not expected in the result; only
    'IBM INC' is.
    """
    sentence = 'Those two organizations IBM INC and company without name LLC are cool.'
    expected_companies = {('IBM', 'INC')}
    lexnlp_tests.test_extraction_func(expected_companies, get_companies, sentence)
def test_company_upper_name():
    """
    Test get_companies with the name_upper argument enabled.

    The mixed-case name 'Ibm' in the text is expected back as 'IBM'.
    """
    sentence = 'This organization Ibm INC should be uppercased'
    expected_companies = {('IBM', 'INC')}
    lexnlp_tests.test_extraction_func(
        expected_companies,
        get_companies,
        sentence,
        name_upper=True)
def test_common_search_all_languages():
    """
    Check that an entity name is found when find_dict_entities is called
    without a text_languages argument.
    """
    def _name_and_alias(found):
        # Reduce every hit to (entity name, matched alias text).
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in found]

    entity = entity_config(1, 'Some Entity', aliases=['Something'])
    sample_text = 'Some Entity should be found in this text.'
    expected_hits = ((entity[1], 'Some Entity'),)

    lexnlp_tests.test_extraction_func(
        expected_hits,
        find_dict_entities,
        sample_text,
        all_possible_entities=[entity],
        actual_data_converter=_name_and_alias,
        debug_print=True)
def test_company_detail_type():
    """
    Test get_companies with the detail_type option enabled.

    With detail_type=True each expected row is a five-element tuple rather
    than a (name, type) pair.
    """
    sentence = 'Those two organizations IBM INC and LexPredict LLC are cool.'
    expected_rows = [
        ('IBM', 'INC', 'CORP', 'Corporation', None),
        ('LexPredict', 'LLC', 'LLC', 'Company', None),
    ]
    lexnlp_tests.test_extraction_func(
        expected_rows,
        get_companies,
        sentence,
        detail_type=True)
def test_company_abbr_name():
    """
    Test get_companies with the parse_name_abbr option enabled.

    With parse_name_abbr=True each expected row has a third element: 'LP'
    for the company followed by "(LP)" in the text, None for the other one.
    """
    sentence = 'Those two organizations IBM INC and LexPredict LLC (LP) are cool.'
    expected_rows = [
        ('IBM', 'INC', None),
        ('LexPredict', 'LLC', 'LP'),
    ]
    lexnlp_tests.test_extraction_func(
        expected_rows,
        get_companies,
        sentence,
        parse_name_abbr=True)
def test_conflicts_take_longest_match():
    """
    Check that the longest matching entity name wins.

    The text contains "Some Entity One"; only the 'Some Entity One' entity is
    expected, not the shorter 'Some Entity'.
    """
    def _name_and_alias(found):
        # Reduce every hit to (entity name, matched alias text).
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in found]

    short_entity = entity_config(1, 'Some Entity', aliases=['Something'])
    entity_one = entity_config(2, 'Some Entity One', aliases=['Something One'])
    entity_two = entity_config(3, 'Some Entity Two', aliases=['Something Two'])

    sample_text = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'
    expected_hits = ((entity_one[1], 'Some Entity One'),)

    lexnlp_tests.test_extraction_func(
        expected_hits,
        find_dict_entities,
        sample_text,
        all_possible_entities=[short_entity, entity_one, entity_two],
        actual_data_converter=_name_and_alias,
        debug_print=True)
def test_equal_aliases_in_dif_languages(self):
    """
    Check that two entities sharing the same alias text are both returned.

    Mississippi and Montserrat both declare the English abbreviation 'MS';
    a search with text_languages=['en'] is expected to report both of them
    for "MS". Canada is not expected, although the text contains the
    lowercase word "can".
    """
    def _name_and_alias(found):
        # Each hit exposes its (entity, alias) pair through the .entity attribute.
        return [(get_entity_name(hit.entity[0]), hit.entity[1][0]) for hit in found]

    mississippi_aliases = [
        entity_alias('MS', is_abbreviation=True, language='en'),
        entity_alias('Mississippi', language='de'),
        entity_alias('Mississippi', language='en'),
    ]
    mississippi = entity_config(1, 'Mississippi', aliases=mississippi_aliases)

    montserrat_aliases = [
        entity_alias('MS', is_abbreviation=True, language='en'),
        entity_alias('Montserrat', language='de'),
        entity_alias('Montserrat', language='en'),
    ]
    montserrat = entity_config(2, 'Montserrat', aliases=montserrat_aliases)

    canada_aliases = [
        entity_alias('CAN', is_abbreviation=True, language='en'),
        entity_alias('Kanada', language='de'),
        entity_alias('Canada', language='en'),
    ]
    canada = entity_config(3, 'Canada', aliases=canada_aliases)

    sample_text = (
        '"MS" here can mean either "MMMississippi" or "MMMontserrat" because they have equal aliases in English. '
        'This test is here because in one version of the code alias texts were required to be unique. '
        '"CCCanada" (can) should not be detected because word "can" is in lowercase here.'
    )

    expected_hits = ((mississippi[1], 'MS'), (montserrat[1], 'MS'))

    lexnlp_tests.test_extraction_func(
        expected_hits,
        find_dict_entities,
        sample_text,
        all_possible_entities=[mississippi, montserrat, canada],
        text_languages=['en'],
        actual_data_converter=_name_and_alias,
        debug_print=True)
def test_abbreviations_simple():
    """
    Check matching of abbreviation aliases.

    The 'IT' abbreviation is expected to be found in "IT's", while the 'IS'
    abbreviation is not expected to be reported for this text.
    """
    def _name_and_alias(found):
        # Reduce every hit to (entity name, matched alias text).
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in found]

    it_entity = entity_config(1, 'ITAbbrev', aliases=[entity_alias('IT', is_abbreviation=True)])
    is_entity = entity_config(2, 'ISAbbrev', aliases=[entity_alias('IS', is_abbreviation=True)])

    sample_text = (
        '"IT\'s" entity should be detected even with "\'s" because tokenizer takes care of this kind of things. '
        '"ISS" entity should not be detected - bacause "is" word'
        ' is in lowercase here and probably does not mean an abbreviation.'
    )

    expected_hits = ((it_entity[1], 'IT'),)

    # NOTE(review): 'ge' is passed as the text language; confirm whether 'de' was intended.
    lexnlp_tests.test_extraction_func(
        expected_hits,
        find_dict_entities,
        sample_text,
        all_possible_entities=[it_entity, is_entity],
        text_languages=['ge'],
        actual_data_converter=_name_and_alias,
        debug_print=True)
def test_get_citations():
    """
    Test default get_citations behavior against the recorded test data.

    For every record the volume, page and year columns are converted to int
    (or None when empty), then get_citations is checked twice: without the
    trailing source text column (return_source=False) and with it
    (return_source=True).
    """
    def _int_or_none(raw):
        return int(raw) if raw else None

    for _index, text, _input_args, raw_rows in lexnlp_tests.iter_test_data_text_and_tuple():
        rows_with_source = []
        for volume, reporter, reporter_full_name, page, page2, court, year, source_text in raw_rows:
            rows_with_source.append((
                _int_or_none(volume),
                reporter,
                reporter_full_name,
                _int_or_none(page),
                page2,
                court,
                _int_or_none(year),
                source_text,
            ))

        # The source text is the last column; drop it for the return_source=False run.
        rows_without_source = [row[:-1] for row in rows_with_source]

        lexnlp_tests.test_extraction_func(rows_without_source, get_citations, text, return_source=False)
        lexnlp_tests.test_extraction_func(rows_with_source, get_citations, text, return_source=True)
def test_am_pm_abbreviations():
    """
    Check how the 'AM' / 'PM' abbreviation aliases behave next to clock times.

    No entity is expected for "11:00 AM", "11:00 PM" or "11:00am"; the 'AM'
    alias is expected only for the stand-alone "(AM)".
    """
    def _name_and_alias(found):
        # Reduce every hit to (entity name, matched alias text).
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in found]

    am = entity_config(
        1, 'America',
        aliases=[entity_alias('AM', is_abbreviation=True)],
        name_is_alias=False)
    pm = entity_config(
        2, 'Postmodernism',
        aliases=[entity_alias('PM', is_abbreviation=True)],
        name_is_alias=False)
    candidates = [am, pm]

    # (expected hits, text) pairs, checked in this order.
    cases = (
        ([], 'It is 11:00 AM or 11:00 PM now.'),
        ([(am[1], 'AM')], 'It is 11:00am now in (AM). Hello!'),
        ([], 'It is 11:00am now.'),
    )

    for expected_hits, sample_text in cases:
        lexnlp_tests.test_extraction_func(
            expected_hits,
            find_dict_entities,
            sample_text,
            all_possible_entities=candidates,
            actual_data_converter=_name_and_alias,
            debug_print=True)