def cache_court_config():
    res = [dict_entities.entity_config(entity_id=i.id,
                                       name=i.name,
                                       priority=0,
                                       aliases=i.alias.split(';') if i.alias else [])
           for i in Court.objects.all()]
    DbCache.put_to_db(CACHE_KEY_COURT_CONFIG, res)

def cache_term_stems():
    # Group terms by their stemmed form: several terms may share one stem.
    term_stems = {}
    for t, pk in Term.objects.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(CACHE_KEY_TERM_STEMS, term_stems)

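# For illustration (hypothetical data): if Term rows 'terminate' (pk=1) and
# 'termination' (pk=2) both stem to 'termin', the cached payload would be:
#
#     {' termin ': {'values': [['terminate', 1], ['termination', 2]],
#                   'length': 2}}
#
# The space-padded keys suggest the Locate task matches whole stems inside
# pre-stemmed, space-joined text with a plain substring test.
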
def process(self, document_field: Dict, config_csv_file: Dict,
            drop_previous_field_detectors: bool,
            update_field_choice_values: bool, **kwargs):
    try:
        self.log_info('Going to configure simple field detection config...')
        document_field = DocumentField.objects.get(pk=document_field['pk'])
        csv_bytes = DbCache.get(config_csv_file['cache_key'])
        apply_simple_config(CeleryTaskLogger(self),
                            document_field,
                            csv_bytes,
                            drop_previous_field_detectors,
                            update_field_choice_values)
    finally:
        DbCache.clean_cache(config_csv_file['cache_key'])

def courts_loader(zip_file: ZipFile, files: list) -> None:
    if Court.objects.exists():
        print('Courts data already uploaded')
        return
    print('Uploading courts...')
    df = load_csv_files(zip_file, files)
    with transaction.atomic():
        courts_count = load_courts(df)
    print('Detected %d courts' % courts_count)
    print('Caching courts config for Locate tasks...')
    DbCache.cache_court_config()

def terms_loader(zip_file: ZipFile, files: list) -> None:
    if Term.objects.exists():
        print('Terms data already uploaded')
        return
    print('Uploading terms...')
    df = load_csv_files(zip_file, files)
    with transaction.atomic():
        terms_count = load_terms(df)
    print('Detected %d terms' % terms_count)
    print('Caching terms config for Locate tasks...')
    DbCache.cache_term_stems()

def geoentities_loader(zip_file: ZipFile, files: list) -> None:
    if GeoEntity.objects.exists():
        print('Geo config data already uploaded')
        return
    print('Uploading geo config ...')
    df = load_csv_files(zip_file, files)
    with transaction.atomic():
        geo_aliases_count, geo_entities_count = load_geo_entities(df)
    print('Total created: %d GeoAliases' % geo_aliases_count)
    print('Total created: %d GeoEntities' % geo_entities_count)
    print('Caching geo config for Locate tasks...')
    DbCache.cache_geo_config()

def _extract_variants_from_text(self, field, text: str, **kwargs):
    geo_entities = None
    document = kwargs.get('document')
    if document is not None:
        # try to extract from GeoEntityUsage
        # pros: faster extraction
        # cons: we may extract extra entities
        geo_entities = extract_models.GeoEntityUsage.objects.filter(
            text_unit__document=document,
            text_unit__unit_type='sentence',
            text_unit__text__contains=text).values('entity_id', 'entity__name')
    if not geo_entities:
        # fall back to running the geo locator over the raw text
        from apps.task.tasks import CACHE_KEY_GEO_CONFIG
        from apps.common.advancedcelery.db_cache import DbCache
        geo_config = DbCache.get(CACHE_KEY_GEO_CONFIG)

        text_languages = None
        if document:
            text_languages = models.TextUnit.objects.filter(
                document=document,
                text__contains=text).values_list('language', flat=True)
            if document.language and not text_languages:
                text_languages = [document.language]

        geo_entities = [{'entity_id': i[0][0], 'entity__name': i[0][1]}
                        for i in get_geoentities(text,
                                                 geo_config_list=geo_config,
                                                 text_languages=text_languages,
                                                 priority=True)]
    return list(geo_entities) or None

def process(self, document_type_config_csv_file: Dict, action: str,
            update_cache: bool, **kwargs):
    # Map the requested action onto the import flags.
    if action == 'validate':
        save = False
        auto_fix_validation_errors = False
        remove_missed_objects = False
    elif action == 'validate|import':
        save = True
        auto_fix_validation_errors = False
        remove_missed_objects = False
    elif action == 'import|auto_fix|retain_missing_objects':
        save = True
        auto_fix_validation_errors = True
        remove_missed_objects = False
    elif action == 'import|auto_fix|remove_missing_objects':
        save = True
        auto_fix_validation_errors = True
        remove_missed_objects = True
    else:
        raise RuntimeError('Unknown action')

    try:
        json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
        document_type = import_document_type(
            json_bytes=json_bytes,
            save=save,
            auto_fix_validation_errors=auto_fix_validation_errors,
            remove_missed_in_dump_objects=remove_missed_objects,
            task=self)
    finally:
        DbCache.clean_cache(document_type_config_csv_file['cache_key'])

    if save and update_cache:
        from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
        if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
            self.log_info('Adapting RawDB table structure after import ...')
            adapt_table_structure(CeleryTaskLogger(self), document_type,
                                  force=False)
            ids = Document.all_objects.filter(
                document_type=document_type).values_list('pk', flat=True)
            self.log_info('Caching document field values ...')
            CacheDocumentFields.start_cache_document_fields_for_doc_ids(self, ids)

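# The if/elif chain above is equivalent to a flag table; a sketch only, with
# an illustrative name (ACTION_FLAGS is not part of the original code):
#
#     ACTION_FLAGS = {
#         # action -> (save, auto_fix_validation_errors, remove_missed_objects)
#         'validate': (False, False, False),
#         'validate|import': (True, False, False),
#         'import|auto_fix|retain_missing_objects': (True, True, False),
#         'import|auto_fix|remove_missing_objects': (True, True, True),
#     }
#     save, auto_fix_validation_errors, remove_missed_objects = ACTION_FLAGS[action]
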
def normalize(task_id, key, value):
    # Return a JSON-serializable representation of value, falling back to
    # serializers, recursion or DbCache storage for non-serializable types.
    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            cache_key = str(task_id) + '__' + str(key) if key else str(task_id)
            # store the raw bytes in DbCache and pass a JSON-safe stub around
            DbCache.put_to_db(cache_key, uploaded_file.read())
            return {'file_name': uploaded_file.name,
                    'cache_key': cache_key}
        return str(value)

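# Minimal sketch of the UploadedFile round trip (values hypothetical): the
# caller gets a JSON-safe stub, and a later task re-reads the bytes by cache
# key and cleans up, exactly as the process() methods above do:
#
#     stub = normalize('42', 'config_csv_file', uploaded)
#     # -> {'file_name': 'config.csv', 'cache_key': '42__config_csv_file'}
#     try:
#         csv_bytes = DbCache.get(stub['cache_key'])
#     finally:
#         DbCache.clean_cache(stub['cache_key'])
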
def cache_geo_config():
    geo_config = {}
    for name, pk, priority in GeoEntity.objects.values_list('name', 'pk', 'priority'):
        entity = dict_entities.entity_config(pk, name, priority or 0,
                                             name_is_alias=True)
        geo_config[pk] = entity
    for alias_id, alias_text, alias_type, entity_id, alias_lang \
            in GeoAlias.objects.values_list('pk', 'alias', 'type', 'entity', 'locale'):
        entity = geo_config[entity_id]
        if entity:
            # 'iso*' and 'abbrev*' alias types are abbreviations (e.g. ISO codes)
            is_abbrev = alias_type.startswith('iso') or alias_type.startswith('abbrev')
            dict_entities.add_aliases_to_entity(entity,
                                                aliases_csv=alias_text,
                                                language=alias_lang,
                                                is_abbreviation=is_abbrev,
                                                alias_id=alias_id)
    res = list(geo_config.values())
    DbCache.put_to_db(CACHE_KEY_GEO_CONFIG, res)

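# Sketch of reading the result back (the exact record layout is whatever
# dict_entities.entity_config() / add_aliases_to_entity() produce):
#
#     cache_geo_config()
#     entities = DbCache.get(CACHE_KEY_GEO_CONFIG)
#     # one record per GeoEntity, carrying its name as an alias plus any
#     # GeoAlias rows; 'iso*'/'abbrev*' types are flagged as abbreviations,
#     # presumably so the matcher can treat them more strictly.
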
def get_term_config():
    return DbCache.get(CACHE_KEY_TERM_STEMS)


def get_court_config():
    return DbCache.get(CACHE_KEY_COURT_CONFIG)


def get_geo_config():
    return DbCache.get(CACHE_KEY_GEO_CONFIG)
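
# Typical round trip (illustrative): a loader warms the cache once after a
# successful import, and Locate tasks read it back through these getters:
#
#     DbCache.cache_geo_config()       # writer side, after load_geo_entities()
#     geo_config = get_geo_config()    # reader side, inside a Locate task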