def make_records_untyped(_apps, _schema_editor):
    for cache_key in CACHE_KEYS:
        untyped = []
        records = DbCache.get(cache_key)
        for record in records:
            if record.__class__.__name__ != 'DictionaryEntry':
                untyped.append(record)
                continue
            try:
                aliases = [(a.alias, a.language, a.is_abbreviation, a.alias_id, a.normalized_alias)
                           for a in record.aliases]
                rec = (
                    record.id,
                    record.name,
                    record.priority,
                    aliases,
                )
                untyped.append(rec)
            except Exception as e:
                print(f'Unable to cast a DictionaryEntry in "{cache_key}" to a tuple: {e}')
        DbCache.put_to_db(cache_key, untyped)
    if DbCache.INSTANCE:
        DbCache.INSTANCE.stop_watching()
def make_records_typed(_apps, _schema_editor):
    for cache_key in CACHE_KEYS:
        typed = []  # type: List[DictionaryEntry]
        records = DbCache.get(cache_key)
        for record in records:
            if record.__class__.__name__ == 'DictionaryEntry':
                typed.append(record)
                continue
            try:
                aliases = [DictionaryEntryAlias(alias, lang, is_abbr, alias_id, norm_als)
                           for alias, lang, is_abbr, alias_id, norm_als in record[3]]
                rec = DictionaryEntry(record[0], record[1], priority=record[2], aliases=aliases)
                typed.append(rec)
            except Exception as e:
                print(f'Unable to cast a record in "{cache_key}" to DictionaryEntry: {e}')
        DbCache.put_to_db(cache_key, typed)
    if DbCache.INSTANCE:
        DbCache.INSTANCE.stop_watching()
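# Illustrative sketch, not part of the original module: both helpers take the
# (apps, schema_editor) signature used by Django data migrations, so they are
# presumably wired up as forward/reverse operations roughly like this:
#
#     class Migration(migrations.Migration):
#         dependencies = [...]
#         operations = [
#             migrations.RunPython(make_records_typed, reverse_code=make_records_untyped),
#         ]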
def cache_court_config():
    res = [dict_entities.entity_config(entity_id=i.id,
                                       name=i.name,
                                       priority=0,
                                       aliases=i.alias.split(';') if i.alias else [])
           for i in Court.objects.all()]
    DbCache.put_to_db(CACHE_KEY_COURT_CONFIG, res)
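# Usage sketch (illustrative): rebuild the cached court configuration after Court
# records change, then read it back through the accessor defined later in this module.
# The exact entity structure depends on dict_entities.entity_config.
#
#     cache_court_config()
#     courts = get_court_config()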
def get_term_config(project_id=None):
    res = None
    if project_id is not None:
        key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)
        res = DbCache.get(key)
    if res is None:
        # fall back to the globally cached term stems when there is no
        # project-specific cache entry
        res = DbCache.get(CACHE_KEY_TERM_STEMS)
    return res
def cache_term_stems():
    term_stems = {}
    for t, pk in Term.objects.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(CACHE_KEY_TERM_STEMS, term_stems)
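# Shape of the cached structure (illustrative values):
#
#     {
#         ' stemmed term ': {'values': [['original term', 123], ...], 'length': 1},
#         ...
#     }
#
# Keys are space-padded stemmed terms; each value groups the raw terms and their
# primary keys that share that stem, plus the size of the group.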
def process(self, document_type_config_csv_file: Dict, action: str, update_cache: bool, **kwargs):
    # Map the requested action onto the import flags.
    if action == 'validate':
        save = False
        auto_fix_validation_errors = False
        remove_missed_objects = False
    elif action == 'validate|import':
        save = True
        auto_fix_validation_errors = False
        remove_missed_objects = False
    elif action == 'import|auto_fix|retain_missing_objects':
        save = True
        auto_fix_validation_errors = True
        remove_missed_objects = False
    elif action == 'import|auto_fix|remove_missing_objects':
        save = True
        auto_fix_validation_errors = True
        remove_missed_objects = True
    else:
        raise RuntimeError(f'Unknown action: {action}')

    try:
        json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
        document_type = import_document_type(json_bytes=json_bytes,
                                             save=save,
                                             auto_fix_validation_errors=auto_fix_validation_errors,
                                             remove_missed_in_dump_objects=remove_missed_objects,
                                             task=self)
    finally:
        DbCache.clean_cache(document_type_config_csv_file['cache_key'])

    if not (save and update_cache):
        return

    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
        self.log_info('Adapting RawDB table structure after import ...')
        adapt_table_structure(CeleryTaskLogger(self), document_type, force=False)

    ids = Document.all_objects.filter(document_type=document_type).values_list('pk', flat=True)
    ids = list(ids)
    self.log_info('Caching document field values ...')
    for chunk in chunks(ids, 50):
        self.run_sub_tasks('Cache field values for a set of documents',
                           ImportDocumentType.cache_document_fields_for_doc_ids,
                           [(list(chunk),)])
def normalize(task_id, key, value):
    DB_CACHED_FILE_LIMIT = 1024 * 1024 * 100  # 100 MB

    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            if uploaded_file.size < DB_CACHED_FILE_LIMIT:
                # Small files go into the DB cache under a task-scoped key.
                cache_key = (str(task_id) + '__' + str(key)) if key else str(task_id)
                DbCache.put_to_db(cache_key, uploaded_file.read())
                return {'file_name': uploaded_file.name, 'cache_key': cache_key}
            # Larger files are written to file storage and referenced by an ExportFile record.
            file_ref = ExportFile()
            file_ref.created_time = datetime.datetime.utcnow()
            file_ref.expires_at = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
            file_ref.comment = f'Import documents from "{uploaded_file.name}" file'
            time_part = str(datetime.datetime.utcnow()).replace('.', '_').replace(':', '_').replace(' ', '_')
            file_name = f'doc_export_{os.path.splitext(uploaded_file.name)[0]}_{time_part}.zip'
            storage = get_file_storage()
            docs_subfolder = storage.sub_path_join(storage.export_path, 'documents')
            try:
                storage.mkdir(docs_subfolder)
            except Exception:
                # the directory may already exist
                pass
            file_ref.file_path = storage.sub_path_join(docs_subfolder, file_name)
            storage.write_file(file_ref.file_path, uploaded_file, uploaded_file.size)
            file_ref.file_created = True
            file_ref.stored_time = datetime.datetime.utcnow()
            file_ref.save()
            return {'file_ref_id': file_ref.pk}
        return str(value)
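# Return shapes for UploadedFile values (illustrative):
#
#     {'file_name': 'docs.zip', 'cache_key': '42__uploaded_file'}   # small file, cached in DB
#     {'file_ref_id': 7}                                            # large file, stored via ExportFile
#
# Everything else either passes through unchanged (JSON-serializable), is serialized
# (models, querysets, collections), or falls back to str(value).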
def normalize(task_id, key, value):
    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            cache_key = (str(task_id) + '__' + str(key)) if key else str(task_id)
            DbCache.put_to_db(cache_key, uploaded_file.read())
            return {'file_name': uploaded_file.name, 'cache_key': cache_key}
        return str(value)
def cache_geo_config():
    geo_config = {}
    for name, pk, priority in GeoEntity.objects.values_list('name', 'pk', 'priority'):
        entity = dict_entities.entity_config(pk, name, priority or 0, name_is_alias=True)
        geo_config[pk] = entity
    for alias_id, alias_text, alias_type, entity_id, alias_lang \
            in GeoAlias.objects.values_list('pk', 'alias', 'type', 'entity', 'locale'):
        entity = geo_config.get(entity_id)
        if entity:
            is_abbrev = alias_type.startswith('iso') or alias_type.startswith('abbrev')
            dict_entities.add_aliases_to_entity(entity,
                                                aliases_csv=alias_text,
                                                language=alias_lang,
                                                is_abbreviation=is_abbrev,
                                                alias_id=alias_id)
    res = list(geo_config.values())
    DbCache.put_to_db(CACHE_KEY_GEO_CONFIG, res)
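# Usage sketch (illustrative): refresh the cached geo entity/alias configuration after
# GeoEntity or GeoAlias changes, then read it back through get_geo_config():
#
#     cache_geo_config()
#     geo_entities = get_geo_config()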
def process(self, document_field: Dict, config_csv_file: Dict,
            drop_previous_field_detectors: bool, update_field_choice_values: bool,
            csv_contains_regexps: bool, **kwargs):
    try:
        self.log_info('Going to configure simple field detection config...')
        document_field = DocumentField.objects.get(pk=document_field['pk'])
        csv_bytes = DbCache.get(config_csv_file['cache_key'])
        apply_simple_config(CeleryTaskLogger(self),
                            document_field,
                            csv_bytes,
                            drop_previous_field_detectors,
                            update_field_choice_values,
                            csv_contains_regexps=csv_contains_regexps)
    finally:
        DbCache.clean_cache(config_csv_file['cache_key'])
def cache_term_stems(project_id=None):
    term_stems = {}
    terms_qs = Term.objects
    key = CACHE_KEY_TERM_STEMS
    if project_id is not None:
        qs = ProjectTermConfiguration.objects.filter(project_id=project_id)
        if qs.exists():
            terms_qs = qs.last().terms
            key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)
    for t, pk in terms_qs.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(key, term_stems)
def download_task_attached_file(document_import_file: Dict[str, Any]) -> Generator[str, None, None]:
    if 'cache_key' in document_import_file:
        # download from the DB cache
        zip_bytes = DbCache.get(document_import_file['cache_key'])
        ext = os.path.splitext(document_import_file['file_name'])[1][1:].lower()
        _fd, fn = tempfile.mkstemp(suffix=ext)
        os.close(_fd)  # the descriptor is not needed; the file is rewritten below
        try:
            with open(fn, 'wb') as fw:
                fw.write(zip_bytes)
            yield fn  # TODO: fix yield ...
        finally:
            DbCache.clean_cache(document_import_file['cache_key'])
    else:
        # download from file storage
        file_ref_id = document_import_file['file_ref_id']
        file_ref = ExportFile.objects.get(pk=file_ref_id)  # type: ExportFile
        storage = get_file_storage()
        with storage.get_as_local_fn(file_ref.file_path) as f_path:
            yield f_path[0]
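# Usage sketch (illustrative): the try/finally around the yield suggests this generator
# is consumed as a context manager (e.g. wrapped with contextlib.contextmanager):
#
#     with download_task_attached_file(document_import_file) as local_path:
#         handle_downloaded_file(local_path)  # hypothetical caller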
def get_court_config():
    return DbCache.get(CACHE_KEY_COURT_CONFIG)


def get_geo_config():
    return DbCache.get(CACHE_KEY_GEO_CONFIG)


def get_term_config():
    return DbCache.get(CACHE_KEY_TERM_STEMS)