def _extract_variants_from_text(self, field, text: str, **kwargs):
    geo_entities = None
    document = kwargs.get('document')
    if document is not None:
        # try to extract from GeoEntityUsage
        # pros: faster extraction
        # cons: we may extract extra entities
        geo_entities = extract_models.GeoEntityUsage.objects \
            .filter(text_unit__document=document,
                    text_unit__unit_type='sentence',
                    text_unit__text__contains=text) \
            .values_list('entity__name', flat=True)

    if not geo_entities:
        from apps.extract import dict_data_cache
        geo_config = dict_data_cache.get_geo_config()

        text_languages = None
        if document:
            text_languages = models.TextUnit.objects.filter(
                document=document,
                text__contains=text).values_list('language', flat=True)
            if document.language and not text_languages:
                text_languages = [document.language]

        geo_entities = [i[0][1] for i in get_geoentities(text,
                                                         geo_config_list=geo_config,
                                                         text_languages=text_languages,
                                                         priority=True)]

    return list(geo_entities) or None
def parse(self, log: ProcessLogger, text, text_unit_id, text_unit_lang,
          document_initial_load: bool = False, **kwargs) -> ParseResults:
    priority = kwargs.get('priority', True)
    geo_config = dict_data_cache.get_geo_config()

    from apps.extract.app_vars import SIMPLE_LOCATOR_TOKENIZATION
    simple_norm = SIMPLE_LOCATOR_TOKENIZATION.val

    entity_alias_pairs = list(geoentities.get_geoentities(
        text,
        geo_config,
        text_languages=[text_unit_lang],
        priority=priority,
        simplified_normalization=simple_norm))

    entity_ids = [entity.id for entity, _alias in entity_alias_pairs]
    if entity_ids:
        unique_entities = set(entity_ids)
        alias_ids = [alias.alias_id for _entity, alias in entity_alias_pairs]
        unique_aliases = set(alias_ids)

        return ParseResults({
            GeoEntityUsage: [GeoEntityUsage(text_unit_id=text_unit_id,
                                            entity_id=idd,
                                            count=entity_ids.count(idd))
                             for idd in unique_entities],
            GeoAliasUsage: [GeoAliasUsage(text_unit_id=text_unit_id,
                                          alias_id=idd,
                                          count=alias_ids.count(idd))
                            for idd in unique_aliases if idd]})
def _extract_variants_from_text(self, field, text: str, **kwargs):
    geo_entities = None
    document = kwargs.get('document')
    if document is not None:
        # try to extract from GeoEntityUsage
        # pros: faster extraction
        # cons: we may extract extra entities
        geo_entities = extract_models.GeoEntityUsage.objects.filter(
            text_unit__document=document,
            text_unit__unit_type='sentence',
            text_unit__text__contains=text).values('entity_id', 'entity__name')

    if not geo_entities:
        from apps.task.tasks import CACHE_KEY_GEO_CONFIG
        from apps.common.advancedcelery.db_cache import DbCache
        geo_config = DbCache.get(CACHE_KEY_GEO_CONFIG)

        text_languages = None
        if document:
            text_languages = models.TextUnit.objects.filter(
                document=document,
                text__contains=text).values_list('language', flat=True)
            if document.language and not text_languages:
                text_languages = [document.language]

        geo_entities = [{'entity_id': i[0][0], 'entity__name': i[0][1]}
                        for i in get_geoentities(text,
                                                 geo_config_list=geo_config,
                                                 text_languages=text_languages,
                                                 priority=True)]

    return list(geo_entities) or None
def en_parsers_speed(self):
    file_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
    with codecs.open(file_path, 'r', encoding='utf-8') as fr:
        text = fr.read()

    ge_path = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
    entities_fn = ge_path + 'geoentities.csv'
    aliases_fn = ge_path + 'geoaliases.csv'
    geo_config = list(DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))

    times = {}  # type: Dict[str, float]
    self.check_time(text, lambda s: list(get_amounts(s)), 'get_amounts', times)
    self.check_time(text, lambda s: list(get_acts(s)), 'get_acts', times)
    self.check_time(text, lambda s: list(get_citations(s)), 'get_citations', times)
    self.check_time(text, lambda s: list(get_conditions(s)), 'get_conditions', times)
    self.check_time(text, lambda s: list(get_constraints(s)), 'get_constraints', times)
    self.check_time(text, lambda s: list(get_copyright(s)), 'get_copyright', times)
    self.check_time(text, lambda s: list(_get_courts(s)), 'get_courts', times)
    self.check_time(text, lambda s: list(get_cusip(s)), 'get_cusip', times)
    self.check_time(text, lambda s: list(get_dates(s)), 'get_dates', times)
    self.check_time(text, lambda s: list(get_definitions(s)), 'get_definitions', times)
    self.check_time(text, lambda s: list(get_distances(s)), 'get_distances', times)
    self.check_time(text, lambda s: list(get_durations(s)), 'get_durations', times)
    self.check_time(text, lambda s: list(get_geoentities(s, geo_config)), 'get_geoentities', times)
    self.check_time(text, lambda s: list(get_money(s)), 'get_money', times)
    self.check_time(text, lambda s: list(get_percents(s)), 'get_percents', times)
    self.check_time(text, lambda s: list(get_pii(s)), 'get_pii', times)
    self.check_time(text, lambda s: list(get_ratios(s)), 'get_ratios', times)
    self.check_time(text, lambda s: list(get_regulations(s)), 'get_regulations', times)
    self.check_time(text, lambda s: list(get_trademarks(s)), 'get_trademarks', times)
    self.check_time(text, lambda s: list(get_urls(s)), 'get_urls', times)

    self.assertTrue('get_amounts' in times)
def get_geoentities_routine(
        text: str,
        geo_config_list: List[DictionaryEntry],
        conflict_resolving_field: str = 'none',
        priority_direction: str = 'asc',
        text_languages: Optional[str] = None,
        min_alias_len: Optional[int] = None,
        prepared_alias_ban_list: Optional[Dict[str, Tuple[List[str], List[str]]]] = None,
        simplified_normalization: bool = False) \
        -> Generator[Tuple[DictionaryEntry, DictionaryEntryAlias], Any, Any]:
    yield from get_geoentities(text,
                               geo_config_list,
                               conflict_resolving_field,
                               priority_direction,
                               [text_languages] if text_languages else None,
                               min_alias_len,
                               prepared_alias_ban_list,
                               simplified_normalization)
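# A minimal usage sketch, not taken from the original sources: it only illustrates how
# get_geoentities_routine (defined above) can be called with a single language code,
# which the wrapper turns into the list that get_geoentities expects. The CSV paths,
# the sample sentence and the helper name _example_routine_usage are assumptions;
# DictionaryEntry.load_entities_from_files is the same loader used in the speed test above.
def _example_routine_usage():
    entities_fn = 'geoentities.csv'  # hypothetical path to the entity dictionary
    aliases_fn = 'geoaliases.csv'    # hypothetical path to the alias dictionary
    geo_config = list(DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))

    sample = 'The supplier is incorporated in Germany and ships from Hamburg.'
    # a single language string is wrapped into ['en'] by the routine itself
    for entity, alias in get_geoentities_routine(sample, geo_config, text_languages='en'):
        print(entity, alias)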
def parse(self, log: ProcessLogger, text, text_unit_id, text_unit_lang,
          document_initial_load: bool = False, **kwargs) -> ParseResults:
    priority = kwargs.get('priority', True)
    geo_config = dict_data_cache.get_geo_config()

    entity_alias_pairs = list(geoentities.get_geoentities(text,
                                                          geo_config,
                                                          text_languages=[text_unit_lang],
                                                          priority=priority))

    entity_ids = [dict_entities.get_entity_id(entity)
                  for entity, _alias in entity_alias_pairs]
    if entity_ids:
        unique_entities = set(entity_ids)
        alias_ids = [dict_entities.get_alias_id(alias)
                     for _entity, alias in entity_alias_pairs]
        unique_aliases = set(alias_ids)

        return ParseResults({
            GeoEntityUsage: [GeoEntityUsage(text_unit_id=text_unit_id,
                                            entity_id=idd,
                                            count=entity_ids.count(idd))
                             for idd in unique_entities],
            GeoAliasUsage: [GeoAliasUsage(text_unit_id=text_unit_id,
                                          alias_id=idd,
                                          count=alias_ids.count(idd))
                            for idd in unique_aliases if idd]})
def test_multiline_address(self):
    text = """
Sincerely,
DUKE REALTY CORPORATION
Ana M. Hernandez
Property Administrator
2400 North Commerce Parkway
Suite 405
Weston, FL 33326
Main: 954-453-5660
P: 954-453-5265
F: 954.453.5695
[email protected]
www.dukerealty.com
Cc: File
LEASE
"""
    ds = list(get_geoentities(text, GEO_CONFIG))
    self.assertEqual(1, len(ds))  # how come?
def test_geoentities_counting():
    text = 'And AND AND AND And'
    actual = list(get_geoentities(text, geo_config_list=_CONFIG))
    assert len(actual) == 3
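# A hedged aggregation sketch, not part of the original tests: it shows how the
# (entity, alias) pairs yielded by get_geoentities can be rolled up into per-entity
# counts with collections.Counter instead of the repeated list.count() calls used in
# the parse() methods above. It assumes the newer API where each matched entity
# exposes an .id attribute, as in the first parse() implementation; count_geoentities
# is a hypothetical helper name.
from collections import Counter

def count_geoentities(text, geo_config):
    pairs = list(get_geoentities(text, geo_config))
    return Counter(entity.id for entity, _alias in pairs)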