def cache_court_config():
    res = [
        dict_entities.entity_config(
            entity_id=i.id,
            name=i.name,
            priority=0,
            aliases=i.alias.split(';') if i.alias else [])
        for i in Court.objects.all()
    ]
    DbCache.put_to_db(CACHE_KEY_COURT_CONFIG, res)
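# Usage sketch (illustrative, not part of the original module; assumes a
# configured Django environment with the Court model populated). The cached
# value is a list of entity_config dicts, one per Court row, readable back
# via get_court_config() at the bottom of this file:
#
#     cache_court_config()
#     courts = get_court_config()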
def cache_term_stems():
    term_stems = {}
    for t, pk in Term.objects.values_list('term', 'pk'):
        # Pad the stemmed term with spaces so it can later be matched on
        # whole-word boundaries, then group every [term, pk] pair that
        # shares the same stem.
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for stem in term_stems:
        term_stems[stem] = dict(values=term_stems[stem],
                                length=len(term_stems[stem]))
    DbCache.put_to_db(CACHE_KEY_TERM_STEMS, term_stems)
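# Illustrative shape of the index cache_term_stems() builds; the sample terms
# and stem below are hypothetical, the structure follows from the code above:
EXAMPLE_TERM_STEMS = {
    ' indemn ': {'values': [['indemnity', 1], ['indemnification', 2]],
                 'length': 2},
}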
    def process(self, document_field: Dict, config_csv_file: Dict,
                drop_previous_field_detectors: bool,
                update_field_choice_values: bool, **kwargs):
        try:
            self.log_info(
                'Going to configure simple field detection config...')
            document_field = DocumentField.objects.get(pk=document_field['pk'])
            csv_bytes = DbCache.get(config_csv_file['cache_key'])
            apply_simple_config(CeleryTaskLogger(self), document_field,
                                csv_bytes, drop_previous_field_detectors,
                                update_field_choice_values)
        finally:
            DbCache.clean_cache(config_csv_file['cache_key'])
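# Hypothetical invocation sketch for the task above; the config_csv_file dict
# mirrors the {'file_name', 'cache_key'} shape that normalize() (later in this
# file) produces for uploaded files, and the try/finally guarantees the cached
# CSV bytes are dropped even if apply_simple_config() raises:
#
#     task.process(document_field={'pk': 42},
#                  config_csv_file={'file_name': 'config.csv',
#                                   'cache_key': '1__config_csv_file'},
#                  drop_previous_field_detectors=True,
#                  update_field_choice_values=False)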
def courts_loader(zip_file: ZipFile, files: list) -> None:
    if Court.objects.exists():
        print('Courts data already uploaded')
        return
    print('Uploading courts...')

    df = load_csv_files(zip_file, files)
    with transaction.atomic():
        courts_count = load_courts(df)

    print('Detected %d courts' % courts_count)
    print('Caching courts config for Locate tasks...')

    DbCache.cache_court_config()
def terms_loader(zip_file: ZipFile, files: list) -> None:
    if Term.objects.exists():
        print('Terms data already uploaded')
        return
    print('Uploading terms...')

    df = load_csv_files(zip_file, files)
    with transaction.atomic():
        terms_count = load_terms(df)

    print('Detected %d terms' % terms_count)
    print('Caching terms config for Locate tasks...')

    DbCache.cache_term_stems()
def geoentities_loader(zip_file: ZipFile, files: list) -> None:
    if GeoEntity.objects.exists():
        print('Geo config data already uploaded')
        return
    print('Uploading geo config ...')

    df = load_csv_files(zip_file, files)
    with transaction.atomic():
        geo_aliases_count, geo_entities_count = load_geo_entities(df)

    print('Total created: %d GeoAliases' % geo_aliases_count)
    print('Total created: %d GeoEntities' % geo_entities_count)
    print('Caching geo config for Locate tasks...')

    DbCache.cache_geo_config()
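# Usage sketch for the three loaders above (assumes a configured Django
# environment; the archive and member names are hypothetical):
#
#     from zipfile import ZipFile
#     with ZipFile('dictionaries.zip') as zf:
#         courts_loader(zf, ['courts.csv'])
#         terms_loader(zf, ['terms.csv'])
#         geoentities_loader(zf, ['geo_entities.csv'])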
    def _extract_variants_from_text(self, field, text: str, **kwargs):

        geo_entities = None
        document = kwargs.get('document')
        if document is not None:
            # try to extract from GeoEntityUsage
            # pros: faster extraction
            # cons: we may extract extra entities
            geo_entities = extract_models.GeoEntityUsage.objects.filter(
                text_unit__document=document,
                text_unit__unit_type='sentence',
                text_unit__text__contains=text).values('entity_id', 'entity__name')

        if not geo_entities:
            from apps.task.tasks import CACHE_KEY_GEO_CONFIG
            from apps.common.advancedcelery.db_cache import DbCache
            geo_config = DbCache.get(CACHE_KEY_GEO_CONFIG)

            text_languages = None
            if document:
                text_languages = models.TextUnit.objects.filter(
                    document=document,
                    text__contains=text).values_list('language', flat=True)
                if document.language and not text_languages:
                    text_languages = [document.language]

            # get_geoentities() yields tuples whose first element is the
            # matched entity config; its first two items are the entity id
            # and name (see entity_config() usage in cache_geo_config() below).
            geo_entities = [{'entity_id': i[0][0], 'entity__name': i[0][1]} for i in
                            get_geoentities(text,
                                            geo_config_list=geo_config,
                                            text_languages=text_languages,
                                            priority=True)]

        return list(geo_entities) or None
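    # Illustrative result shape for _extract_variants_from_text() (values
    # hypothetical): both the GeoEntityUsage path and the get_geoentities()
    # fallback normalize to
    #     [{'entity_id': 123, 'entity__name': 'Canada'}, ...]
    # or None when nothing matched.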
    def process(self, document_type_config_csv_file: Dict, action: str,
                update_cache: bool, **kwargs):

        if action == 'validate':
            save = False
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'validate|import':
            save = True
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'import|auto_fix|retain_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = False
        elif action == 'import|auto_fix|remove_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = True
        else:
            raise RuntimeError('Unknown action')

        try:
            json_bytes = DbCache.get(
                document_type_config_csv_file['cache_key'])
            document_type = import_document_type(
                json_bytes=json_bytes,
                save=save,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_objects,
                task=self)
        finally:
            DbCache.clean_cache(document_type_config_csv_file['cache_key'])

        if save and update_cache:
            from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
            if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
                self.log_info(
                    'Adapting RawDB table structure after import ...')
                adapt_table_structure(CeleryTaskLogger(self),
                                      document_type,
                                      force=False)
            ids = Document.all_objects.filter(
                document_type=document_type).values_list('pk', flat=True)
            self.log_info('Caching document field values ...')
            CacheDocumentFields.start_cache_document_fields_for_doc_ids(
                self, ids)
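# The if/elif ladder at the top of process() encodes three booleans per action
# string; the same mapping as a lookup table (a sketch of an equivalent
# refactoring, not the original code):
ACTION_FLAGS = {
    # action: (save, auto_fix_validation_errors, remove_missed_objects)
    'validate': (False, False, False),
    'validate|import': (True, False, False),
    'import|auto_fix|retain_missing_objects': (True, True, False),
    'import|auto_fix|remove_missing_objects': (True, True, True),
}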
def normalize(task_id, key, value):
    try:
        # Fast path: leave the value as-is if it is already JSON-serializable.
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            cache_key = str(task_id) + '__' + str(key) if key else str(task_id)
            DbCache.put_to_db(cache_key, uploaded_file.read())
            return {'file_name': uploaded_file.name, 'cache_key': cache_key}
        return str(value)
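# Illustrative behaviour of normalize() (task_id/key values hypothetical):
# JSON-serializable values pass through unchanged, Django models and querysets
# are serialized to plain dicts, containers are pre-serialized recursively,
# and uploaded files are cached in DbCache and replaced by a small reference:
#
#     normalize(1, 'n', 42)                # -> 42
#     normalize(1, 'csv', uploaded_file)   # -> {'file_name': 'x.csv',
#                                          #     'cache_key': '1__csv'}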
def cache_geo_config():
    geo_config = {}
    for name, pk, priority in GeoEntity.objects.values_list(
            'name', 'pk', 'priority'):
        entity = dict_entities.entity_config(pk,
                                             name,
                                             priority or 0,
                                             name_is_alias=True)
        geo_config[pk] = entity
    for alias_id, alias_text, alias_type, entity_id, alias_lang \
            in GeoAlias.objects.values_list('pk', 'alias', 'type', 'entity', 'locale'):
        # Use .get() so an alias pointing at a missing entity is skipped
        # instead of raising KeyError (the guard below expects that).
        entity = geo_config.get(entity_id)
        if entity:
            is_abbrev = alias_type.startswith('iso') or alias_type.startswith(
                'abbrev')
            dict_entities.add_aliases_to_entity(entity,
                                                aliases_csv=alias_text,
                                                language=alias_lang,
                                                is_abbreviation=is_abbrev,
                                                alias_id=alias_id)
    res = list(geo_config.values())
    DbCache.put_to_db(CACHE_KEY_GEO_CONFIG, res)
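# Illustrative shape of the cached geo payload; exact keys depend on
# dict_entities.entity_config() and are not confirmed here:
#
#     [{'entity_id': 1, 'name': 'Canada', 'priority': 0,
#       'aliases': [...]},   # one entry per GeoEntity, aliases folded in
#      ...]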
def get_term_config():
    return DbCache.get(CACHE_KEY_TERM_STEMS)
def get_court_config():
    return DbCache.get(CACHE_KEY_COURT_CONFIG)
def get_geo_config():
    return DbCache.get(CACHE_KEY_GEO_CONFIG)
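# Typical read path for Locate tasks (sketch; assumes the caches were
# populated by the cache_* functions above). Note that get_term_config()
# returns the stem index cached under CACHE_KEY_TERM_STEMS, despite its name:
#
#     court_config = get_court_config()
#     term_stems = get_term_config()
#     geo_config = get_geo_config()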