def _extract_variants_from_text(self, field, text: str, **kwargs):
        """
        Collect geo entity values that can be offered for ``text``.

        When ``kwargs`` carries a ``document``, stored ``GeoEntityUsage`` rows
        are consulted first; the locator is only run over ``text`` if that
        lookup yields nothing (or no document was passed).

        :param field: not used by this implementation.
        :param text: text fragment to look up / scan.
        :param kwargs: may contain ``document``, which narrows the lookup and
            supplies language hints for the locator.
        :return: list of found values, or ``None`` when nothing was found.
        """
        document = kwargs.get('document')

        stored_names = None
        if document is not None:
            # Cheap path: reuse usages already saved for sentence text units
            # of this document whose text contains the given fragment.
            # Faster than re-running extraction, but may bring in extra
            # entities.
            usage_qs = extract_models.GeoEntityUsage.objects.filter(
                text_unit__document=document,
                text_unit__unit_type='sentence',
                text_unit__text__contains=text)
            stored_names = usage_qs.values_list('entity__name', flat=True)

        if stored_names:
            return list(stored_names)

        # Slow path: run the geo entity locator directly on the text.
        from apps.extract import dict_data_cache
        geo_config = dict_data_cache.get_geo_config()

        text_languages = None
        if document:
            unit_qs = models.TextUnit.objects.filter(document=document,
                                                     text__contains=text)
            text_languages = unit_qs.values_list('language', flat=True)
            # No matching text unit: fall back to the document language
            # when it is set.
            if document.language and not text_languages:
                text_languages = [document.language]

        located = get_geoentities(text,
                                  geo_config_list=geo_config,
                                  text_languages=text_languages,
                                  priority=True)
        # Each result's first element is indexed at position 1 - presumably
        # the entity name; confirm against get_geoentities.
        names = [match[0][1] for match in located]
        return names or None
# Example #2
# 0
    def parse(self, log: ProcessLogger, text, text_unit_id, text_unit_lang,
              document_initial_load: bool = False, **kwargs) -> ParseResults:
        """
        Locate geo entities in ``text`` and build usage records for a text unit.

        :param log: process logger; not used by this implementation.
        :param text: text to scan for geo entities.
        :param text_unit_id: id stored on every created usage record.
        :param text_unit_lang: language passed to the locator as the only
            text language.
        :param document_initial_load: not used by this implementation.
        :param kwargs: ``priority`` (default ``True``) is forwarded to the
            locator.
        :return: ``ParseResults`` holding one ``GeoEntityUsage`` per distinct
            entity id and one ``GeoAliasUsage`` per distinct truthy alias id,
            each with its number of occurrences; ``None`` when no entity was
            found.
        """
        from collections import Counter

        priority = kwargs.get('priority', True)
        geo_config = dict_data_cache.get_geo_config()
        from apps.extract.app_vars import SIMPLE_LOCATOR_TOKENIZATION
        simple_norm = SIMPLE_LOCATOR_TOKENIZATION.val
        entity_alias_pairs = list(geoentities.get_geoentities(text,
                                                              geo_config,
                                                              text_languages=[text_unit_lang],
                                                              priority=priority,
                                                              simplified_normalization=simple_norm))

        entity_ids = [entity.id for entity, _alias in entity_alias_pairs]
        if not entity_ids:
            # Nothing located: callers receive None, as before.
            return None

        alias_ids = [alias.alias_id for _entity, alias in entity_alias_pairs]

        # Tally occurrences in one pass instead of calling list.count()
        # once per distinct id.
        entity_counts = Counter(entity_ids)
        alias_counts = Counter(alias_ids)

        # Iterate the sets (not the counters) so the order of the produced
        # records stays what it was.
        entity_usages = [GeoEntityUsage(text_unit_id=text_unit_id,
                                        entity_id=entity_id,
                                        count=entity_counts[entity_id])
                         for entity_id in set(entity_ids)]
        # Falsy alias ids are skipped.
        alias_usages = [GeoAliasUsage(text_unit_id=text_unit_id,
                                      alias_id=alias_id,
                                      count=alias_counts[alias_id])
                        for alias_id in set(alias_ids) if alias_id]

        return ParseResults({GeoEntityUsage: entity_usages,
                             GeoAliasUsage: alias_usages})
    def parse(self,
              log: ProcessLogger,
              text,
              text_unit_id,
              text_unit_lang,
              document_initial_load: bool = False,
              **kwargs) -> ParseResults:
        """
        Locate geo entities in ``text`` and build usage records for a text unit.

        :param log: process logger; not used by this implementation.
        :param text: text to scan for geo entities.
        :param text_unit_id: id stored on every created usage record.
        :param text_unit_lang: language passed to the locator as the only
            text language.
        :param document_initial_load: not used by this implementation.
        :param kwargs: ``priority`` (default ``True``) is forwarded to the
            locator.
        :return: ``ParseResults`` holding one ``GeoEntityUsage`` per distinct
            entity id and one ``GeoAliasUsage`` per distinct truthy alias id,
            each with its number of occurrences; ``None`` when no entity was
            found.
        """
        from collections import Counter

        priority = kwargs.get('priority', True)
        geo_config = dict_data_cache.get_geo_config()
        entity_alias_pairs = list(
            geoentities.get_geoentities(text,
                                        geo_config,
                                        text_languages=[text_unit_lang],
                                        priority=priority))

        entity_ids = [
            dict_entities.get_entity_id(entity)
            for entity, _alias in entity_alias_pairs
        ]
        if not entity_ids:
            # Nothing located: callers receive None, as before.
            return None

        alias_ids = [
            dict_entities.get_alias_id(alias)
            for _entity, alias in entity_alias_pairs
        ]

        # Tally occurrences in one pass instead of calling list.count()
        # once per distinct id.
        entity_counts = Counter(entity_ids)
        alias_counts = Counter(alias_ids)

        # Iterate the sets (not the counters) so the order of the produced
        # records stays what it was.
        entity_usages = [
            GeoEntityUsage(text_unit_id=text_unit_id,
                           entity_id=entity_id,
                           count=entity_counts[entity_id])
            for entity_id in set(entity_ids)
        ]
        # Falsy alias ids are skipped.
        alias_usages = [
            GeoAliasUsage(text_unit_id=text_unit_id,
                          alias_id=alias_id,
                          count=alias_counts[alias_id])
            for alias_id in set(alias_ids) if alias_id
        ]

        return ParseResults({
            GeoEntityUsage: entity_usages,
            GeoAliasUsage: alias_usages,
        })