def load_doc(task: ExtendedTask,
             document: Document,
             document_fields: Dict,
             run_detect_field_values: bool,
             filed_owners: dict = None):
    """Persist a document together with its pre-loaded field values.

    Saves the document (inserting when it has no pk yet), removes previously
    machine-created field values for an existing document, stores the newly
    resolved values and then refreshes the detected-values cache.

    :param task: celery task used for logging progress.
    :param document: document to save; inserted when ``pk`` is ``None``.
    :param document_fields: raw field data resolved via
        ``LoadDocumentWithFields.load_field_values``.
    :param run_detect_field_values: when True re-detect and cache values,
        otherwise only re-cache the values just stored.
    :param filed_owners: optional owner mapping forwarded to the loader
        (NOTE(review): name looks like a typo for ``field_owners`` — kept
        for backward compatibility with keyword callers).
    """
    owners = filed_owners or {}
    resolved_values = LoadDocumentWithFields.load_field_values(
        task, document, document_fields, owners)
    logger = CeleryTaskLogger(task)

    with transaction.atomic():
        is_new = document.pk is None
        document.save(force_insert=is_new)

        if not is_new:
            # Drop only machine-created values; user-entered ones
            # (created_by/modified_by set, or removed_by_user) are kept.
            DocumentFieldValue.objects \
                .filter(document=document,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .delete()

        for field, values in resolved_values.items():
            field_detection.save_detected_values(document, field, values)

        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(
                logger, document, True)
        else:
            dfvs = field_detection.detect_and_cache_field_values_for_document(
                logger, document, False)
            field_value_cache.cache_field_values(document, dfvs, save=True)

    task.log_info('Loaded {0} field values for document #{1} ({2})'
                  .format(len(resolved_values), document.pk, document.name))
def import_document(self, values: Dict[str, Any]):
    """Create and save one Document from a row of exported values.

    Plain columns are copied verbatim; foreign keys are remapped through
    the importer's id-translation dicts; dump-style 't' flags are turned
    into booleans. The new pk is recorded in the importer's bookkeeping.

    :param values: column-name -> raw value mapping for one document row.
    """
    doc = Document()

    # Columns copied as-is from the source row.
    for column in ('name', 'description', 'source', 'source_type',
                   'paragraphs', 'sentences', 'title', 'language',
                   'file_size', 'folder', 'document_class',
                   'fields_dirty', 'source_path'):
        setattr(doc, column, values[column])

    # Foreign keys remapped onto ids that exist in the target DB.
    doc.document_type_id = self.document_types[str(values['document_type_id'])]
    doc.project_id = self.project_ids[values['project_id']]
    doc.status_id = str(values['status_id'])

    if not pd.isnull(values['assign_date']):
        doc.assign_date = values['assign_date']

    # The dump encodes booleans as 't'/'f' strings.
    doc.delete_pending = values['delete_pending'] == 't'
    doc.processed = values['processed'] == 't'

    if not pd.isnull(values['assignee_id']):
        # Every assignee in the dump is remapped to the single target user.
        doc.assignee = self.target_user

    doc.save()

    self.document_ids[values['id']] = doc.pk
    self.document_src_paths[doc.pk] = doc.source_path
    self.initially_loaded_docs.append(doc.pk)
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields
    for the document, converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc:
    :param save:
    :param suggested_field_values:
    :return:
    """
    document_type = doc.document_type  # type: DocumentType
    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())

    # Merge every non-removed stored value into one python value per field.
    merged = {f: None for f in all_fields}
    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue
        ft = FIELD_TYPES_REGISTRY[fv.field.type]  # type: FieldType
        merged[fv.field] = ft.merge_multi_python_values(merged.get(fv.field),
                                                        fv.python_value)

    # Convert merged python values into their sortable DB-aware form.
    uid_to_db_value = {
        f.uid: FIELD_TYPES_REGISTRY[f.type].merged_python_value_to_db(merged[f])
        for f in all_fields
    }

    if suggested_field_values:
        code_to_suggested = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        code_to_suggested = None

    # Store suggested (detected) values under the dedicated suggested uids.
    for f in all_fields:  # type: DocumentField
        ft = f.get_field_type()  # type: FieldType
        if not f.is_detectable():
            continue
        suggested_uid = Document.get_suggested_field_uid(f.uid)
        if code_to_suggested:
            suggested_db = ft.merged_python_value_to_db(
                code_to_suggested.get(f.code))
        else:
            # No fresh suggestions: keep whatever was cached previously.
            suggested_db = doc.field_values.get(suggested_uid) \
                if doc.field_values else None
        uid_to_db_value[suggested_uid] = suggested_db

    if save:
        doc.field_values = uid_to_db_value
        doc.save()

    return uid_to_db_value
def cache_generic_values(doc: Document, save: bool = True, log: ProcessLogger = None):
    """Refresh the document's generic data and broadcast a change event.

    :param doc: document whose ``generic_data`` is recomputed.
    :param save: when True, persist only the ``generic_data`` column.
    :param log: optional process logger forwarded to the change event.
    """
    doc.generic_data = get_generic_values(doc)
    if save:
        doc.save(update_fields=['generic_data'])
    # Only generic fields changed — system/user flags stay False.
    change = events.DocumentChangedEvent(log=log,
                                         document=doc,
                                         system_fields_changed=False,
                                         generic_fields_changed=True,
                                         user_fields_changed=False,
                                         pre_detected_field_values=None)
    events.on_document_change(change)
def cache_generic_values(doc: Document,
                         save: bool = True,
                         log: ProcessLogger = None,
                         fire_doc_changed_event: bool = True):
    """Refresh the document's generic data, optionally saving and signalling.

    :param doc: document whose ``generic_data`` is recomputed.
    :param save: when True, persist only the ``generic_data`` column.
    :param log: optional process logger forwarded to the signal.
    :param fire_doc_changed_event: when False, skip the document-changed signal.
    """
    doc.generic_data = get_generic_values(doc)
    if save:
        doc.save(update_fields=['generic_data'])
    if not fire_doc_changed_event:
        return
    # Only generic fields changed — system/user flags stay False.
    signals.fire_document_changed(sender=cache_generic_values,
                                  log=log,
                                  document=doc,
                                  system_fields_changed=False,
                                  generic_fields_changed=True,
                                  user_fields_changed=False,
                                  pre_detected_field_values=None)
def cache_generic_values(doc: Document, save: bool = True):
    """Aggregate cluster/party/currency/date stats for a document.

    Annotates a single-document queryset with DB-side aggregates and stores
    the resulting dict in ``doc.generic_data``.

    :param doc: document whose ``generic_data`` is recomputed.
    :param save: when True, persist only the ``generic_data`` column.
    """
    aggregates = {
        'cluster_id': Max('documentcluster'),
        'parties': StringAgg('textunit__partyusage__party__name',
                             delimiter=', ', distinct=True),
        'max_currency_amount': Max('textunit__currencyusage__amount'),
        'max_currency_name': Max('textunit__currencyusage__currency'),
        'min_date': Min('textunit__dateusage__date'),
        'max_date': Max('textunit__dateusage__date'),
    }
    annotated = Document.objects.filter(pk=doc.pk).annotate(**aggregates)
    # One row (or None) with exactly the annotated columns.
    doc.generic_data = annotated.values(*aggregates.keys()).first()
    if save:
        doc.save(update_fields=['generic_data'])
def load_doc(task: ExtendedTask,
             document: Document,
             field_values_alias_to_value: Dict[str, Any],
             run_detect_field_values: bool,
             field_owners: Dict[str, User] = None):
    """Save a document with pre-resolved field values in one transaction.

    Saves the document (inserting when it has no pk yet), creates its
    metadata record, writes every resolved field value through the field
    repository, then either re-runs field detection or fires the
    document-changed signal for the initial load.

    :param task: celery task used for logging progress.
    :param document: document to save; inserted when ``pk`` is ``None``.
    :param field_values_alias_to_value: field alias -> raw value mapping
        resolved via ``LoadDocumentWithFields.load_field_values``.
    :param run_detect_field_values: when True re-detect and cache values,
        otherwise only announce the document change.
    :param field_owners: optional field code -> user mapping recorded as
        the author of each written value.
    """
    owners = field_owners if field_owners else {}
    fields_to_values = LoadDocumentWithFields.load_field_values(
        task, document, field_values_alias_to_value)
    log = CeleryTaskLogger(task)

    # Imported lazily to avoid a circular module dependency.
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    with transaction.atomic():
        is_new = document.pk is None
        document.save(force_insert=is_new)
        # NOTE(review): created unconditionally — presumably documents loaded
        # here never have metadata yet; confirm for the pre-existing-pk path.
        DocumentMetadata.objects.create(document=document,
                                        metadata={'parsed_by': None})

        for field, dto in fields_to_values.items():
            field_repo.update_field_value_with_dto(document=document,
                                                   field=field,
                                                   field_value_dto=dto,
                                                   user=owners.get(field.code))

        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(
                log=log, document=document, save=True, clear_old_values=False)
        else:
            signals.fire_document_changed(sender=task,
                                          log=log,
                                          document=document,
                                          changed_by_user=None,
                                          document_initial_load=True,
                                          system_fields_changed=True,
                                          generic_fields_changed=True,
                                          user_fields_changed=True)

    details = ';\n'.join(f'{f}: {dto.field_value}'
                         for f, dto in fields_to_values.items())
    task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                  .format(len(fields_to_values), document.pk, document.name,
                          details))
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields
    for the document, converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc:
    :param save:
    :param suggested_field_values:
    :param log
    :return:
    """
    document_type = doc.document_type  # type: DocumentType
    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())

    # Related-info fields are cached as a count of values, not the values.
    related_info_uids = {f.uid for f in all_fields if f.is_related_info_field()}

    # Merge every non-removed stored value into one python value per field.
    merged = {f: None for f in all_fields}
    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue
        ft = FIELD_TYPES_REGISTRY[fv.field.type]  # type: FieldType
        merged[fv.field] = ft.merge_multi_python_values(merged.get(fv.field),
                                                        fv.python_value)

    # Convert merged python values into their sortable DB-aware form.
    uid_to_db_value = {
        f.uid: FIELD_TYPES_REGISTRY[f.type].merged_python_value_to_db(merged[f])
        for f in all_fields
    }

    if suggested_field_values:
        code_to_suggested = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        code_to_suggested = None

    # Store suggested (detected) values under the dedicated suggested uids.
    for f in all_fields:  # type: DocumentField
        ft = f.get_field_type()  # type: FieldType
        if not f.is_detectable():
            continue
        suggested_uid = Document.get_suggested_field_uid(f.uid)
        if code_to_suggested:
            suggested_db = ft.merged_python_value_to_db(
                code_to_suggested.get(f.code))
        else:
            # No fresh suggestions: keep whatever was cached previously.
            suggested_db = doc.field_values.get(suggested_uid) \
                if doc.field_values else None
        uid_to_db_value[suggested_uid] = suggested_db

    if save:
        doc.field_values = {
            uid: (len(value)
                  if uid in related_info_uids and value is not None
                  else value)
            for uid, value in uid_to_db_value.items()
        }
        doc.save()
        events.on_document_change(
            events.DocumentChangedEvent(
                log=log,
                document=doc,
                system_fields_changed=False,
                generic_fields_changed=False,
                user_fields_changed=True,
                pre_detected_field_values=code_to_suggested))

    return uid_to_db_value