def export_document_files(self, ids: List[int], target_path: str):
    storage = get_file_storage()
    file_paths = self.document_repository.get_document_source_paths_by_id(ids)
    for doc_id, file_path in file_paths:
        if not file_path:
            self.log_message(
                f"Document #{doc_id} doesn't have a link to the original file")
            continue
        doc_file_name = os.path.basename(file_path)
        new_name = f'{doc_id}_{doc_file_name}'
        target_filepath = os.path.join(target_path, new_name)
        try:
            doc_stor_path = storage.sub_path_join(storage.documents_path, file_path)
            file_obj = storage.read(doc_stor_path)
            if not file_obj:
                self.log_message(
                    f'Document file "{file_path}" was not found in storage')
                continue
            with open(target_filepath, 'wb') as fw:
                fw.write(file_obj)
        except Exception as e:
            self.log_message(f'Error storing "{doc_file_name}": {e}')

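# A minimal sketch (hypothetical values) of the naming scheme used by the exporter above:
# the document id is prefixed to the file name so that an importer such as import_doc_files
# further below can map an exported file back to its document.
import os

doc_id, file_path, target_path = 42, 'contracts/2019/nda.pdf', '/tmp/export'
new_name = f'{doc_id}_{os.path.basename(file_path)}'    # '42_nda.pdf'
target_filepath = os.path.join(target_path, new_name)   # '/tmp/export/42_nda.pdf'
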
def move_classifier(apps, schema_editor):
    # move the content of the "models/en/contract_class" folder
    # to "models/en/contract_type_classifier/document"
    src = 'models/en/contract_class'
    dest = 'models/en/contract_type_classifier/document'
    file_storage = get_file_storage()
    file_storage.ensure_folder_exists(dest)

    files_moved = 0
    for file in file_storage.list(src):
        file_name_only = os.path.basename(file)
        dest_path = os.path.join(dest, file_name_only)
        file_storage.rename_file(file, dest_path, move_file=True)
        files_moved += 1
    print(f'{files_moved} files are moved to "{dest}"')

    # create MLModel record
    with connection.cursor() as cursor:
        cursor.execute(
            """
            INSERT INTO analyze_mlmodel
                (name, version, vector_name, model_path, is_active, "default",
                 apply_to, target_entity, language, project_id)
            VALUES ('Document contract class classifier (en)', '', '', %s, true, true,
                    'document', 'contract_type_classifier', 'en', null)
            ON CONFLICT DO NOTHING;""", [dest])

def rename_old_document(self, doc_id) -> None:
    doc = Document.all_objects.get(pk=doc_id)  # type: Document
    new_name, new_path = self.make_new_doc_name(doc)

    # rename the file and the document itself
    from apps.common.file_storage import get_file_storage
    stor = get_file_storage()
    try:
        stor.rename_document(doc.source_path, new_path)
        self.log_func(
            f'ForceUnique: "{doc.source_path}" is renamed to "{new_path}"')
    except Exception as ex:
        self.log_func(
            f'ForceUnique: error while renaming "{doc.source_path}" to "{new_path}":\n'
            + str(ex))
        # return  # zombie document detected

    try:
        doc.source_path = new_path
        doc.name = os.path.basename(doc.source_path)
        doc.save()
    except Exception as ex:
        msg = f'ForceUnique: error while saving renamed doc at {doc.source_path}:\n' + str(ex)
        self.log_func(msg)
        raise Exception(msg)

    try:
        # "reindex" - update the document's name in the cache
        from apps.rawdb.field_value_tables import update_document_name
        update_document_name(doc.pk, doc.name)
    except Exception as ex:
        msg = f'ForceUnique: error updating RawDB cache (name) {doc.name}:\n' + str(ex)
        self.log_func(msg)

def delete_document_files(paths: List[str], logger: Callable = None) -> None:
    stor = get_file_storage()
    for path in paths:
        try:
            stor.delete_document(path)
        except Exception as e:
            msg = f'Unable to delete file "{path}" in {type(stor).__name__}'
            DocumentFilesCleaner.log_error(msg, e, logger)

def delete_document_files(paths: List[str]):
    stor = get_file_storage()
    for path in paths:
        try:
            stor.delete_document(path)
        except Exception as e:
            raise Exception(f'DocumentFilesCleaner: error deleting "{path}": {e}') from e

def download_file_data(self, request, *_args, **kwargs):
    exp_file = ExportFile.objects.get(pk=kwargs['object_id'])  # type: ExportFile
    storage = get_file_storage()
    file_data = storage.read(exp_file.file_path)
    file_name = os.path.basename(exp_file.file_path)

    response = HttpResponse(file_data, content_type='application/zip')
    response['Content-Disposition'] = f'attachment; filename="{file_name}"'
    response['Content-Length'] = len(file_data)
    response['filename'] = file_name

    exp_file.downloaded = True
    exp_file.save()
    return response

def get_notification_template_resource(rfn: str) -> Optional[bytes]:
    fn = os.path.normpath(os.path.join(settings.NOTIFICATION_CUSTOM_TEMPLATES_PATH_IN_MEDIA, rfn))
    if not fn.startswith(settings.NOTIFICATION_CUSTOM_TEMPLATES_PATH_IN_MEDIA):
        raise RuntimeError('File name should be inside its parent dir: {0}'.format(rfn))
    res = get_file_storage().read(fn)
    if res:
        return res

    fn = os.path.normpath(os.path.join(settings.NOTIFICATION_EMBEDDED_TEMPLATES_PATH, rfn))
    if not fn.startswith(settings.NOTIFICATION_EMBEDDED_TEMPLATES_PATH):
        raise RuntimeError('File name should be inside its parent dir: {0}'.format(rfn))
    with open(fn, 'rb') as f:
        return f.read()

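# A hedged usage sketch: the relative template name is resolved against the custom
# templates folder in file storage first, then falls back to the embedded templates
# shipped with the code. The template name below is hypothetical.
html = get_notification_template_resource('document_loaded.html')
if html is not None:
    body = html.decode('utf-8')
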
def normalize(task_id, key, value):
    DB_CACHED_FILE_LIMIT = 1024 * 1024 * 100
    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            if uploaded_file.size < DB_CACHED_FILE_LIMIT:
                # small files go to the DB cache
                cache_key = (str(task_id) + '__' + str(key)) if key else str(task_id)
                DbCache.put_to_db(cache_key, uploaded_file.read())
                return {
                    'file_name': uploaded_file.name,
                    'cache_key': cache_key
                }
            else:
                # large files go to file storage and are referenced by an ExportFile record
                file_ref = ExportFile()
                file_ref.created_time = datetime.datetime.utcnow()
                file_ref.expires_at = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
                file_ref.comment = f'Import documents from "{uploaded_file.name}" file'
                time_part = str(datetime.datetime.utcnow()) \
                    .replace('.', '_').replace(':', '_').replace(' ', '_')
                file_name = f'doc_export_{os.path.splitext(uploaded_file.name)[0]}_{time_part}.zip'
                storage = get_file_storage()
                docs_subfolder = storage.sub_path_join(storage.export_path, 'documents')
                try:
                    storage.mkdir(docs_subfolder)
                except:
                    # the folder may already exist
                    pass
                file_ref.file_path = storage.sub_path_join(docs_subfolder, file_name)
                storage.write_file(file_ref.file_path, uploaded_file, uploaded_file.size)
                file_ref.file_created = True
                file_ref.stored_time = datetime.datetime.utcnow()
                file_ref.save()
                return {'file_ref_id': file_ref.pk}
        return str(value)

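# A small, hedged sketch of how normalize() treats an uploaded file, using Django's
# SimpleUploadedFile as a stand-in; the task id and payload are made up. Files under
# the 100 MB limit are placed in the DB cache and referenced by 'cache_key'; larger
# ones are written to file storage and referenced by 'file_ref_id'.
from django.core.files.uploadedfile import SimpleUploadedFile

small_file = SimpleUploadedFile('documents.zip', b'PK...')
result = normalize(task_id='00000000-0000-0000-0000-000000000001',
                   key='file',
                   value=small_file)
# -> {'file_name': 'documents.zip', 'cache_key': '00000000-0000-0000-0000-000000000001__file'}
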
def __init__(self):
    self.calc = RatingCalculator()
    fstor = get_file_storage()
    extra_language_paths = fstor.list(CUSTOM_LANG_STORAGE_FOLDER)
    for file_path in extra_language_paths:
        file_data = fstor.read(file_path)
        with tempfile.NamedTemporaryFile() as fw:
            fw.write(file_data)
            fw.flush()  # make sure the data is on disk before reading it back
            lang_df = pandas.read_pickle(fw.name)
        lang, _ = os.path.splitext(os.path.basename(file_path))
        self.calc.distribution_by_lang[lang] = lang_df

    # load default language features
    self.calc.init_language_data([
        os.path.join(os.path.dirname(lexnlp_ocr_path.__file__), './reference_vectors')
    ])

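# Illustration (hypothetical file name) of how a custom language distribution file maps
# to a language code: the base name without its extension becomes the key in
# calc.distribution_by_lang.
import os

file_path = 'custom_language_data/fr.pickle'
lang, _ = os.path.splitext(os.path.basename(file_path))   # 'fr'
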
def create_document(task: ExtendedTask, uri: str, project_id, run_detect_field_values):
    file_storage = get_file_storage()
    with file_storage.get_document_as_local_fn(uri) as (fn, file_name):
        task.task.title = 'Load Document: {0}'.format(uri)
        task.log_extra = {'log_document_name': uri}

        with open(fn, encoding='utf-8') as data_file:
            data = json.loads(data_file.read())

        project = Project.objects.get(pk=project_id)
        document_type = project.type
        document = Document(
            name=file_name,
            project=project,
            document_type=document_type,
            metadata={'parsed_by': None}
        )
        LoadDocumentWithFields.load_doc(task, document, data, run_detect_field_values)

def batch_upload(self, request, **kwargs):
    """
    Upload files from a given sub-folder of the media/data/documents folder\n
    Params:
        - source_path: relative path to a folder with documents
        - send_email_notifications: bool (optional) - send a notification email that batch uploading has started
    """
    session = self.get_object()
    session_id = session.pk
    project = session.project
    folder_name = request.POST.get('folder') or request.POST.get('source_path')
    if not session_id or not folder_name:
        raise ValidationError('Provide session id and folder name.')

    file_list = get_file_storage().list_documents(folder_name)
    # TODO: limit file size - see def upload()
    for file_path in file_list:
        file_name = os.path.basename(file_path)
        # Running locators and detecting field values has been moved to the LoadDocuments
        # task to unify the behavior of the old and new UI.
        call_task(task_name='LoadDocuments',
                  source_data=file_path,
                  user_id=request.user.id,
                  session_id=session_id,
                  metadata={
                      'session_id': session_id,
                      'file_name': file_name
                  },
                  run_standard_locators=True,
                  linked_tasks=None)

    if project.send_email_notification and \
            request.POST.get('send_email_notifications') == 'true' and \
            not session.notified_upload_started:
        self._notify_upload_started(session)
    return Response('Started')

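# A hedged example of calling the batch_upload endpoint; the host, URL path, token and
# folder name are hypothetical, the parameter names come from the docstring above.
import requests

resp = requests.post(
    'https://contraxsuite.example.com/api/v1/project/upload-session/<session_id>/batch_upload/',
    headers={'Authorization': 'Token <token>'},
    data={'source_path': 'batch_2020_01', 'send_email_notifications': 'true'})
print(resp.status_code, resp.text)   # expected: 200 'Started'
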
def download_task_attached_file(
        document_import_file: Dict[str, Any]) -> Generator[str, None, None]:
    if 'cache_key' in document_import_file:
        # download from the DB cache
        zip_bytes = DbCache.get(document_import_file['cache_key'])
        ext = os.path.splitext(document_import_file['file_name'])[1][1:].lower()
        _fd, fn = tempfile.mkstemp(suffix=ext)
        try:
            with open(fn, 'wb') as fw:
                fw.write(zip_bytes)
            yield fn  # TODO: fix yield ...
        finally:
            DbCache.clean_cache(document_import_file['cache_key'])
    else:
        # download from the file storage cache
        file_ref_id = document_import_file['file_ref_id']
        file_ref = ExportFile.objects.get(pk=file_ref_id)  # type: ExportFile
        storage = get_file_storage()
        with storage.get_as_local_fn(file_ref.file_path) as f_path:
            yield f_path[0]

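# Assumption: callers consume this generator as a context manager (e.g. via
# contextlib.contextmanager) so that the cleanup in the finally block runs after the
# local file has been used; the cache key and file name below are hypothetical.
import contextlib
import shutil

attached = {'cache_key': 'task-1__file', 'file_name': 'documents.zip'}
with contextlib.contextmanager(download_task_attached_file)(attached) as local_path:
    shutil.copy(local_path, '/tmp/documents_copy.zip')   # work with the local file here
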
def import_doc_files(self):
    storage = get_file_storage()
    file_ptrn = re.compile(r'^\d+_.*')
    for name_only in os.listdir(self.source_path):
        if not file_ptrn.match(name_only):
            continue
        # map the doc id from the exported file name to the id of the migrated document
        doc_id = int(name_only.split('_')[0])
        doc_id = self.document_ids.get(doc_id)
        if not doc_id:
            self.log_error(f'File "{name_only}" - migrated doc was not found')
            continue
        dest_file_path = self.document_src_paths.get(doc_id)
        if not dest_file_path:
            self.log_error(
                f'File "{name_only}", #{doc_id} - document source path was not found')
            continue
        if storage.document_exists(dest_file_path):
            self.log_info(f'Document "{dest_file_path}" already exists')
            continue

        src_file_path = os.path.join(self.source_path, name_only)
        with open(src_file_path, 'rb') as fr:
            content = fr.read()

        # ensure the destination subfolder exists
        doc_folder = os.path.dirname(dest_file_path)
        if doc_folder:
            try:
                storage.mk_doc_dir(doc_folder)
            except:
                # folder might be already created
                pass
        try:
            storage.write_document(dest_file_path, content, len(content))
        except Exception as e:
            self.log_error(f'Error storing file "{dest_file_path}": {e}')
            raise

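# A tiny, self-contained illustration (hypothetical file name) of how the importer maps
# an exported file name back to a document id; this mirrors the '{doc_id}_{file_name}'
# naming used by export_document_files above.
import re

file_ptrn = re.compile(r'^\d+_.*')
name_only = '42_nda.pdf'
if file_ptrn.match(name_only):
    exported_doc_id = int(name_only.split('_')[0])   # 42 - the id in the source system
    # self.document_ids then maps this id to the id assigned to the migrated document
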
def process(self, **kwargs):
    self.log_info('Going to load document with fields...')
    document_name = kwargs.get('document_name')
    project = Project.objects.get(pk=kwargs.get('project_id'))  # type: Project
    run_detect_field_values = bool(kwargs.get('run_detect_field_values'))
    document_fields = kwargs.get('document_fields') or {}  # type: Dict
    file_storage = get_file_storage()

    if document_fields:
        document = Document(
            name=document_name,
            project=project,
            document_type=project.type,
        )
        LoadDocumentWithFields.load_doc(self, document, document_fields, run_detect_field_values)

    path = kwargs['source_data']
    if path:
        self.log_info('Parse {0} at {1}'.format(path, file_storage))
        file_list = file_storage.list_documents(path)
        self.log_info("Detected {0} files. Added {0} subtasks.".format(len(file_list)))
        if len(file_list) == 0:
            raise RuntimeError('Wrong file or directory name or directory is empty: {}'
                               .format(path))
        load_docs_args = [(file_path, project.pk, run_detect_field_values)
                          for file_path in file_list]
        self.run_sub_tasks('Load Each Document',
                           LoadDocumentWithFields.create_document,
                           load_docs_args,
                           file_list)

# Project imports
from apps.common.file_storage import get_file_storage
from apps.document.repository.document_field_repository import DocumentFieldRepository
from apps.document.repository.document_repository import DocumentRepository
from apps.task.models import Task
from apps.task.tasks import purge_task

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2020, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-contraxsuite/blob/1.7.0/LICENSE"
__version__ = "1.7.0"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

file_storage = get_file_storage()


def cleanup_document_relations(document):
    # 1. delete history
    document_repo = DocumentRepository()
    field_repo = DocumentFieldRepository()
    document_repo.delete_document_history_by_ids([document.pk])
    field_repo.delete_document_history_values(document.pk)

    # INFO: skip "delete step" (set delete=False) since we clean tasks periodically now
    # 2. delete Tasks, Task history, TaskResults, child tasks
    if document.metadata and document.metadata.get('cascade_delete_tasks', True):
        task_kwargs = dict(file_name=document.name)

def get_file_storage(cls) -> ContraxsuiteFileStorage:
    if not cls.file_storage:
        cls.file_storage = get_file_storage()
    return cls.file_storage

def delete_document_files(paths: List[str]):
    stor = get_file_storage()
    for path in paths:
        stor.delete_document(path)

def sync_imanage_document(task: ExtendedTask, imanage_config_id: int, imanage_doc_id: str):
    task.log_info('Synchronizing iManage document #{0} of config #{1}'
                  .format(imanage_doc_id, imanage_config_id))
    imanage_doc = IManageDocument.objects \
        .filter(imanage_config_id=imanage_config_id, imanage_doc_id=imanage_doc_id) \
        .select_related('imanage_config').get()
    file_storage = get_file_storage()
    try:
        imanage_config = imanage_doc.imanage_config
        log = CeleryTaskLogger(task)
        project = imanage_config.resolve_dst_project(imanage_doc.imanage_doc_data, log)
        project_id = project.pk
        assignee = imanage_config.resolve_assignee(imanage_doc.imanage_doc_data, log)
        assignee_id = assignee.pk if assignee else None
        task.log_info('Assignee resolved to: {0}'
                      .format(assignee.get_full_name() if assignee else '<no assignee>'))

        task.log_info('Downloading iManage document contents into a temp file...')
        auth_token = imanage_config.login()
        filename, response = imanage_config.load_document(auth_token, imanage_doc_id)
        upload_session_id = str(uuid.uuid4())
        filename = get_valid_filename(filename)
        rel_filepath = os.path.join(upload_session_id, filename)
        _, ext = os.path.splitext(filename) if filename else ('', '')

        with buffer_contents_into_temp_file(response, ext) as temp_fn:
            # upload the file to file storage
            with open(temp_fn, 'rb') as f:
                file_storage.mk_doc_dir(upload_session_id)
                file_storage.write_document(rel_filepath, f)

            kwargs = {
                'document_type_id': imanage_config.document_type_id,
                'project_id': project_id,
                'assignee_id': assignee_id,
                'user_id': get_main_admin_user().pk,
                'propagate_exception': True,
                'run_standard_locators': True,
                'metadata': {},
                'do_not_check_exists': True
            }

            pre_defined_fields = None
            if imanage_doc.imanage_doc_data and imanage_config.imanage_to_contraxsuite_field_binding:
                pre_defined_fields = dict()
                for imanage_field_code, contraxsuite_field_code \
                        in dict(imanage_config.imanage_to_contraxsuite_field_binding).items():
                    imanage_field_value = imanage_doc.imanage_doc_data.get(imanage_field_code)
                    if imanage_field_value:
                        pre_defined_fields[contraxsuite_field_code] = imanage_field_value
                        task.log_info('Assigning iManage field {0} to Contraxsuite field {1}: {2}'
                                      .format(imanage_field_code, contraxsuite_field_code,
                                              imanage_field_value))
                    else:
                        task.log_info('iManage field {0} has no value assigned.'
                                      .format(imanage_field_code))
            else:
                task.log_info('No binding of iManage fields to Contraxsuite fields.')

            document_id = LoadDocuments \
                .create_document_local(task,
                                       temp_fn,
                                       rel_filepath,
                                       kwargs,
                                       return_doc_id=True,
                                       pre_defined_doc_fields_code_to_python_val=pre_defined_fields)

            if document_id:
                task.log_info('Created Contraxsuite document #{0}'.format(document_id))
                imanage_doc.document_id = document_id
                imanage_doc.last_sync_date = timezone.now()
                imanage_doc.save(update_fields=['document_id', 'last_sync_date'])
            else:
                task.log_error('Unable to create Contraxsuite document for '
                               'iManage document #{0}'.format(imanage_doc_id))
                raise RuntimeError('No document loaded.')
    except Exception as ex:
        msg = render_error('Unable to synchronize iManage document #{0}'.format(imanage_doc_id), ex)
        task.log_error(msg)
        imanage_doc.import_problem = True
        imanage_doc.save(update_fields=['import_problem'])