Example #1
def make_records_untyped(_apps, _schema_editor):
    for cache_key in CACHE_KEYS:
        untyped = []
        records = DbCache.get(cache_key)
        for record in records:
            if record.__class__.__name__ != 'DictionaryEntry':
                untyped.append(record)
                continue
            try:
                aliases = [(a.alias, a.language, a.is_abbreviation, a.alias_id,
                            a.normalized_alias) for a in record.aliases]
                rec = (
                    record.id,
                    record.name,
                    record.priority,
                    aliases,
                )
                untyped.append(rec)
            except Exception as e:
                print(
                    f'Unable to cast a DictionaryEntry in "{cache_key}" to a tuple: {e}'
                )

        DbCache.put_to_db(cache_key, untyped)
    if DbCache.INSTANCE:
        DbCache.INSTANCE.stop_watching()
Example #2
def make_records_typed(_apps, _schema_editor):
    for cache_key in CACHE_KEYS:
        typed = []  # type: List[DictionaryEntry]
        records = DbCache.get(cache_key)
        for record in records:
            if record.__class__.__name__ == 'DictionaryEntry':
                typed.append(record)
                continue
            try:
                aliases = [
                    DictionaryEntryAlias(alias, lang, is_abbr, alias_id,
                                         norm_als)
                    for alias, lang, is_abbr, alias_id, norm_als in record[3]
                ]
                rec = DictionaryEntry(record[0],
                                      record[1],
                                      priority=record[2],
                                      aliases=aliases)
                typed.append(rec)
            except Exception as e:
                print(
                    f'Unable to cast a record in "{cache_key}" to DictionaryEntry: {e}'
                )

        DbCache.put_to_db(cache_key, typed)
    if DbCache.INSTANCE:
        DbCache.INSTANCE.stop_watching()
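The two helpers above look like the forward and reverse halves of a Django data migration. A minimal sketch of how they might be wired up, assuming the surrounding migration module (the app label and dependency name are invented):

from django.db import migrations

class Migration(migrations.Migration):
    dependencies = [
        ('extract', '0001_initial'),  # hypothetical dependency
    ]
    operations = [
        # forward: plain tuples -> DictionaryEntry objects;
        # reverse: DictionaryEntry objects -> plain tuples
        migrations.RunPython(make_records_typed,
                             reverse_code=make_records_untyped),
    ]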
Example #3
def cache_court_config():
    res = [dict_entities.entity_config(
        entity_id=i.id,
        name=i.name,
        priority=0,
        aliases=i.alias.split(';') if i.alias else []
    ) for i in Court.objects.all()]
    DbCache.put_to_db(CACHE_KEY_COURT_CONFIG, res)
Example #4
def get_term_config(project_id=None):
    res = None
    if project_id is not None:
        key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)
        res = DbCache.get(key)
    if res is None:
        res = DbCache.get(CACHE_KEY_TERM_STEMS)
    return res
Example #5
def cache_term_stems():
    term_stems = {}
    for t, pk in Term.objects.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(CACHE_KEY_TERM_STEMS, term_stems)
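The cached structure maps each space-padded stem string to a dict holding 'values' (a list of [term, pk] pairs) and 'length'. A readback sketch through Example #4's getter (the stem string and pks are invented):

cache_term_stems()                 # rebuild and store the mapping
stems = get_term_config()          # Example #4: falls back to CACHE_KEY_TERM_STEMS
entry = stems.get(' leas ')        # keys are stems padded with single spaces
if entry:
    print(entry['length'], entry['values'])  # e.g. 2 [['lease', 10], ['leases', 11]]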
Example #6
    def process(self,
                document_type_config_csv_file: Dict,
                action: str,
                update_cache: bool,
                **kwargs):

        if action == 'validate':
            save = False
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'validate|import':
            save = True
            auto_fix_validation_errors = False
            remove_missed_objects = False
        elif action == 'import|auto_fix|retain_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = False
        elif action == 'import|auto_fix|remove_missing_objects':
            save = True
            auto_fix_validation_errors = True
            remove_missed_objects = True
        else:
            raise RuntimeError(f'Unknown action: {action}')

        try:
            json_bytes = DbCache.get(document_type_config_csv_file['cache_key'])
            document_type = import_document_type(json_bytes=json_bytes,
                                                 save=save,
                                                 auto_fix_validation_errors=auto_fix_validation_errors,
                                                 remove_missed_in_dump_objects=remove_missed_objects,
                                                 task=self)
        finally:
            DbCache.clean_cache(document_type_config_csv_file['cache_key'])

        if not (save and update_cache):
            return

        from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
        if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
            self.log_info('Adapting RawDB table structure after import ...')
            adapt_table_structure(CeleryTaskLogger(self), document_type, force=False)
        ids = Document.all_objects.filter(document_type=document_type).values_list('pk', flat=True)
        ids = list(ids)
        self.log_info('Caching document field values ...')

        for chunk in chunks(ids, 50):
            self.run_sub_tasks('Cache field values for a set of documents',
                               ImportDocumentType.cache_document_fields_for_doc_ids,
                               [(list(chunk),)])
Example #7
def normalize(task_id, key, value):
    DB_CACHED_FILE_LIMIT = 1024 * 1024 * 100  # 100 MB
    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            if uploaded_file.size < DB_CACHED_FILE_LIMIT:
                cache_key = f'{task_id}__{key}' if key else str(task_id)
                DbCache.put_to_db(cache_key, uploaded_file.read())
                return {
                    'file_name': uploaded_file.name,
                    'cache_key': cache_key
                }
            else:
                file_ref = ExportFile()
                file_ref.created_time = datetime.datetime.utcnow()
                file_ref.expires_at = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
                file_ref.comment = f'Import documents from "{uploaded_file.name}" file'
                time_part = str(datetime.datetime.utcnow()).replace(
                    '.', '_').replace(':', '_').replace(' ', '_')
                file_name = f'doc_export_{os.path.splitext(uploaded_file.name)[0]}_{time_part}.zip'

                storage = get_file_storage()
                docs_subfolder = storage.sub_path_join(storage.export_path,
                                                       'documents')
                try:
                    storage.mkdir(docs_subfolder)
                except Exception:
                    pass  # the directory may already exist
                file_ref.file_path = storage.sub_path_join(
                    docs_subfolder, file_name)
                storage.write_file(file_ref.file_path, uploaded_file,
                                   uploaded_file.size)
                file_ref.file_created = True
                file_ref.stored_time = datetime.datetime.utcnow()
                file_ref.save()
                return {'file_ref_id': file_ref.pk}

        return str(value)
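Note the two return shapes the UploadedFile branch can produce: {'file_name': ..., 'cache_key': ...} for files under the 100 MB limit and {'file_ref_id': ...} for larger ones. These are exactly the two shapes that download_task_attached_file in Example #12 below dispatches on.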
Example #8
def normalize(task_id, key, value):
    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            cache_key = str(task_id) + '__' + str(key) if key else str(task_id)
            DbCache.put_to_db(cache_key, uploaded_file.read())
            return {'file_name': uploaded_file.name, 'cache_key': cache_key}
        return str(value)
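A round-trip sketch for the UploadedFile branch above (the task id, key, and file contents are invented, and DbCache is assumed to be backed by a reachable database):

from django.core.files.uploadedfile import SimpleUploadedFile

meta = normalize('task-42', 'source_doc',
                 SimpleUploadedFile('contract.pdf', b'%PDF-1.4 ...'))
# meta == {'file_name': 'contract.pdf', 'cache_key': 'task-42__source_doc'}
raw = DbCache.get(meta['cache_key'])    # the bytes written by put_to_db
DbCache.clean_cache(meta['cache_key'])  # callers are expected to clean up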
Example #9
def cache_geo_config():
    geo_config = {}
    for name, pk, priority in GeoEntity.objects.values_list('name', 'pk', 'priority'):
        entity = dict_entities.entity_config(pk, name, priority or 0, name_is_alias=True)
        geo_config[pk] = entity
    for alias_id, alias_text, alias_type, entity_id, alias_lang \
            in GeoAlias.objects.values_list('pk', 'alias', 'type', 'entity', 'locale'):
        entity = geo_config.get(entity_id)  # .get() so unknown entities are skipped
        if entity:
            is_abbrev = alias_type.startswith('iso') or alias_type.startswith('abbrev')
            dict_entities.add_aliases_to_entity(entity,
                                                aliases_csv=alias_text,
                                                language=alias_lang,
                                                is_abbreviation=is_abbrev,
                                                alias_id=alias_id)
    res = list(geo_config.values())
    DbCache.put_to_db(CACHE_KEY_GEO_CONFIG, res)
Example #10
    def process(self,
                document_field: Dict,
                config_csv_file: Dict,
                drop_previous_field_detectors: bool,
                update_field_choice_values: bool,
                csv_contains_regexps: bool,
                **kwargs):
        try:
            self.log_info('Going to configure simple field detection config...')
            document_field = DocumentField.objects.get(pk=document_field['pk'])
            csv_bytes = DbCache.get(config_csv_file['cache_key'])
            apply_simple_config(CeleryTaskLogger(self),
                                document_field,
                                csv_bytes,
                                drop_previous_field_detectors,
                                update_field_choice_values,
                                csv_contains_regexps=csv_contains_regexps)
        finally:
            DbCache.clean_cache(config_csv_file['cache_key'])
Example #11
def cache_term_stems(project_id=None):
    term_stems = {}

    terms_qs = Term.objects
    key = CACHE_KEY_TERM_STEMS

    if project_id is not None:
        qs = ProjectTermConfiguration.objects.filter(project_id=project_id)
        if qs.exists():
            terms_qs = qs.last().terms
            key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)

    for t, pk in terms_qs.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(key, term_stems)
Example #12
def download_task_attached_file(
        document_import_file: Dict[str, Any]) -> Generator[str, None, None]:
    if 'cache_key' in document_import_file:
        # download from DB cache
        zip_bytes = DbCache.get(document_import_file['cache_key'])
        ext = os.path.splitext(
            document_import_file['file_name'])[1].lower()  # keep the dot for mkstemp
        fd, fn = tempfile.mkstemp(suffix=ext)
        os.close(fd)  # mkstemp opens the file; we reopen the path ourselves
        try:
            with open(fn, 'wb') as fw:
                fw.write(zip_bytes)
            yield fn  # yield after the with-block so the file is flushed and closed
        finally:
            DbCache.clean_cache(document_import_file['cache_key'])
    else:
        # download from file storage cache
        file_ref_id = document_import_file['file_ref_id']
        file_ref = ExportFile.objects.get(pk=file_ref_id)  # type: ExportFile
        storage = get_file_storage()
        with storage.get_as_local_fn(file_ref.file_path) as f_path:
            yield f_path[0]
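download_task_attached_file is a generator that yields exactly once per branch, the usual shape for a contextlib.contextmanager body. A usage sketch under that assumption (the wrapping and all argument values are illustrative, not taken from the source):

import contextlib

download_ctx = contextlib.contextmanager(download_task_attached_file)
with download_ctx({'cache_key': 'task-42__source_doc',
                   'file_name': 'contracts.zip'}) as local_path:
    print('archive available at', local_path)  # cache entry is cleaned on exit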
Example #13
def get_court_config():
    return DbCache.get(CACHE_KEY_COURT_CONFIG)
Example #14
def get_geo_config():
    return DbCache.get(CACHE_KEY_GEO_CONFIG)
Example #15
def get_term_config():
    return DbCache.get(CACHE_KEY_TERM_STEMS)
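Each getter pairs with one of the cache_* writers shown earlier; a minimal round-trip sketch (it assumes Example #3's writer is importable alongside this getter and that the cache backend is reachable):

cache_court_config()           # Example #3: rebuild and store the court list
courts = get_court_config()    # returns whatever was last stored under the key
print(f'{len(courts)} court entries cached')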