def test_am_pm_none(self):
    """Check 'AM'/'PM' abbreviation aliases against clock-time phrases.

    The test expects no entity for "11:00 AM", "11:00 PM" or "11:00am",
    and exactly one entity ('America') for a standalone "(AM)".
    """
    america = entity_config(
        1, 'America',
        aliases=[entity_alias('AM', is_abbreviation=True)],
        name_is_alias=False)
    postmodernism = entity_config(
        2, 'Postmodernism',
        aliases=[entity_alias('PM', is_abbreviation=True)],
        name_is_alias=False)
    candidates = [america, postmodernism]

    def extract(text):
        # Materialize the results so they can be counted and indexed.
        return list(find_dict_entities(text, all_possible_entities=candidates))

    # Upper-case markers that directly follow a time are expected to be skipped.
    self.assertEqual(0, len(extract('It is 11:00 AM or 11:00 PM now.')))

    # Only the parenthesized "(AM)" is expected to be reported.
    found = extract('It is 11:00am now in (AM). Hello!')
    self.assertEqual(1, len(found))
    self.assertEqual('America', found[0].entity[0][1])

    # Lower-case "am" glued to the time is expected to yield nothing.
    self.assertEqual(0, len(extract('It is 11:00am now.')))
def test_am_pm_abbreviations():
    """Run the AM/PM abbreviation scenarios through the shared test helper.

    Each scenario pairs an input text with the (entity name, alias text)
    tuples the extractor is expected to produce for it.
    """
    america = entity_config(
        1, 'America',
        aliases=[entity_alias('AM', is_abbreviation=True)],
        name_is_alias=False)
    postmodernism = entity_config(
        2, 'Postmodernism',
        aliases=[entity_alias('PM', is_abbreviation=True)],
        name_is_alias=False)
    candidates = [america, postmodernism]

    def name_and_alias(actual):
        # NOTE(review): results are indexed positionally here, while other
        # tests in this file read them through `.entity` - confirm which
        # result shape find_dict_entities yields.
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in actual]

    scenarios = (
        ([], 'It is 11:00 AM or 11:00 PM now.'),
        ([(america[1], 'AM')], 'It is 11:00am now in (AM). Hello!'),
        ([], 'It is 11:00am now.'),
    )
    for expected, text in scenarios:
        lexnlp_tests.test_extraction_func(
            expected,
            find_dict_entities,
            text,
            all_possible_entities=candidates,
            actual_data_converter=name_and_alias,
            debug_print=True)
def test_abbreviations_simple():
    """Check that an abbreviation alias matches only its upper-case form.

    The text contains "IT's" and a lower-case "is"; the test expects only
    the 'IT' alias of the first entity to be reported.
    """
    it_entity = entity_config(
        1, 'ITAbbrev', aliases=[entity_alias('IT', is_abbreviation=True)])
    is_entity = entity_config(
        2, 'ISAbbrev', aliases=[entity_alias('IS', is_abbreviation=True)])
    candidates = [it_entity, is_entity]

    sample = (
        '"IT\'s" entity should be detected even with "\'s" because tokenizer takes care of this kind of things. '
        '"ISS" entity should not be detected - bacause "is" word'
        ' is in lowercase here and probably does not mean an abbreviation.'
    )

    def name_and_alias(actual):
        # NOTE(review): results are indexed positionally here, while other
        # tests in this file read them through `.entity` - confirm which
        # result shape find_dict_entities yields.
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in actual]

    expected = ((it_entity[1], 'IT'),)

    # NOTE(review): 'ge' is passed as the text language and both aliases are
    # declared without a language - confirm whether 'de' was intended.
    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=candidates,
        text_languages=['ge'],
        actual_data_converter=name_and_alias,
        debug_print=True)
def test_plural_case_matching():
    """Check plural-form matching when `use_stemmer=True` is passed.

    The test expects "tables" and "masloboykas" to be reported as 'Table'
    and 'masloboyka', and expects no match of "men" against 'man'.
    """
    table_entity = entity_config(
        1, 'Table',
        aliases=[entity_alias('tbl.', is_abbreviation=True)],
        name_is_alias=True)
    man_entity = entity_config(2, 'man', name_is_alias=True)
    made_up_entity = entity_config(3, 'masloboyka', name_is_alias=True)
    candidates = [table_entity, man_entity, made_up_entity]

    sample = (
        'We should detect the singular number of word "tables" here - the stemmer takes care of plural case. '
        'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. '
        'But it works for word "masloboykas" - a non existing word in English in plural case.'
    )

    def name_and_alias(actual):
        # NOTE(review): results are indexed positionally here, while other
        # tests in this file read them through `.entity` - confirm which
        # result shape find_dict_entities yields.
        return [(get_entity_name(hit[0]), hit[1][0]) for hit in actual]

    expected = (
        (table_entity[1], 'Table'),
        (made_up_entity[1], 'masloboyka'),
    )

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=candidates,
        use_stemmer=True,
        actual_data_converter=name_and_alias,
        debug_print=True)
def test_conflicts_equal_length_take_same_language(self):
    """Check that an alias in another language does not win a conflict.

    'Some Entity One' is declared as a French alias while the text language
    is 'de'; the test expects only the shorter 'Some Entity' match.
    """
    plain_entity = entity_config(1, 'Some Entity', aliases=['Something'])
    french_alias_entity = entity_config(
        2, 'Some Entity1',
        aliases=[entity_alias('Some Entity One', language='fr')])
    other_entity = entity_config(3, 'Some Entity2', aliases=['Something Two'])
    candidates = [plain_entity, french_alias_entity, other_entity]

    sample = (
        '"Some Entity One" should not be found in this text because it is not in German language.'
        'Shorter match - "Someeee Entityyy" should be taken instead.'
    )

    def name_and_alias(actual):
        # Reduce each result to (entity name, matched alias text).
        return [
            (get_entity_name(hit.entity[0]), hit.entity[1][0])
            for hit in actual
        ]

    expected = ((plain_entity[1], 'Some Entity'), )

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=candidates,
        text_languages=['de'],
        actual_data_converter=name_and_alias,
        debug_print=True)
def test_equal_aliases_in_dif_languages(self):
    """Check that two entities may share the same alias text.

    Mississippi and Montserrat both declare the English abbreviation 'MS';
    the test expects both to be reported for "MS", and expects the
    lower-case word "can" not to match the 'CAN' abbreviation.
    """
    def localized(name, abbreviation, german_name):
        # Alias order follows the original declaration:
        # English abbreviation, German name, English name.
        return [
            entity_alias(abbreviation, is_abbreviation=True, language='en'),
            entity_alias(german_name, language='de'),
            entity_alias(name, language='en'),
        ]

    mississippi = entity_config(
        1, 'Mississippi',
        aliases=localized('Mississippi', 'MS', 'Mississippi'))
    montserrat = entity_config(
        2, 'Montserrat',
        aliases=localized('Montserrat', 'MS', 'Montserrat'))
    canada = entity_config(
        3, 'Canada',
        aliases=localized('Canada', 'CAN', 'Kanada'))
    candidates = [mississippi, montserrat, canada]

    sample = (
        '"MS" here can mean either "MMMississippi" or "MMMontserrat" because they have equal aliases in English. '
        'This test is here because in one version of the code alias texts were required to be unique. '
        '"CCCanada" (can) should not be detected because word "can" is in lowercase here.'
    )

    def name_and_alias(actual):
        # Reduce each result to (entity name, matched alias text).
        return [
            (get_entity_name(hit.entity[0]), hit.entity[1][0])
            for hit in actual
        ]

    expected = ((mississippi[1], 'MS'), (montserrat[1], 'MS'))

    lexnlp_tests.test_extraction_func(
        expected,
        find_dict_entities,
        sample,
        all_possible_entities=candidates,
        text_languages=['en'],
        actual_data_converter=name_and_alias,
        debug_print=True)
def test_find_dict_entities_empty_text():
    """Check that an empty input text produces no entities."""
    america = entity_config(
        1, 'America',
        aliases=[entity_alias('AM', is_abbreviation=True)],
        name_is_alias=False)
    empty_text = ''

    # Materialize the results so that "no results" shows up as a falsy list.
    found = list(find_dict_entities(empty_text, [america]))
    assert_false(found)
def test_get_alias_text():
    """Check that get_alias_text returns the first value given to entity_alias."""
    sample_alias = entity_alias('alias', 'lang', False, 123)
    extracted = get_alias_text(sample_alias)
    assert_equals('alias', extracted)
def test_get_alias_id(self):
    """Check that get_alias_id returns the fourth value given to entity_alias."""
    sample_alias = entity_alias('alias', 'lang', False, 123)
    extracted = get_alias_id(sample_alias)
    assert_equals(123, extracted)