Example #1
0
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[
                           List[DetectedFieldValue]],
                       save: bool = True) -> Dict[str, Any]:
    """
    Rebuild the cached field values structure of a document.

    Merges the python values of all DocumentFieldValue objects of the document which are not removed by user
    (per field), converts the merged values to their DB form keyed by field uid, adds a "suggested" entry for
    every detectable field and optionally stores the result in Document.field_values.

    :param doc: document to build the cache for
    :param suggested_field_values: detected values to use for the "suggested" entries; when empty or None
        the suggested entries are copied from the current doc.field_values
    :param save: if true, assign the result to doc.field_values and save the document
    :return: dict of field uid / suggested field uid -> DB form of the value
    """
    doc_type = doc.document_type  # type: DocumentType
    # TODO: get/save field value for specific field
    fields = list(doc_type.fields.all())

    # Every field of the document type gets an entry, even if no value is stored for it.
    merged_by_field = dict.fromkeys(fields)

    for stored in doc.documentfieldvalue_set.all():
        if not stored.removed_by_user:
            owner = stored.field
            value_type = FIELD_TYPES_REGISTRY[owner.type]  # type: FieldType
            merged_by_field[owner] = value_type.merge_multi_python_values(
                merged_by_field.get(owner), stored.python_value)

    result = {
        f.uid: FIELD_TYPES_REGISTRY[f.type].merged_python_value_to_db(merged_by_field[f])
        for f in fields
    }

    suggested_by_code = None
    if suggested_field_values:
        suggested_by_code = merge_detected_field_values_to_python_value(suggested_field_values)

    for f in fields:  # type: DocumentField
        value_type = f.get_field_type()  # type: FieldType
        if not f.is_detectable():
            continue

        suggested_uid = Document.get_suggested_field_uid(f.uid)
        if suggested_by_code:
            suggested_db = value_type.merged_python_value_to_db(suggested_by_code.get(f.code))
        elif doc.field_values:
            # Nothing was detected now - keep the suggestion cached earlier.
            suggested_db = doc.field_values.get(suggested_uid)
        else:
            suggested_db = None

        result[suggested_uid] = suggested_db

    if save:
        doc.field_values = result
        doc.save()

    return result
def _fill_system_fields_to_python_values(document: Document,
                                         field_to_python_values: Dict[str,
                                                                      List]):
    """
    Put the values of the system fields of the document into the provided mapping (modified in place).

    :param document: document to read the values from
    :param field_to_python_values: mapping of field code -> value to fill
    :return: None
    """
    # Read the attributes used more than once a single time.
    full_text = document.full_text
    project = document.project
    assignee = document.assignee
    status = document.status

    field_to_python_values[FIELD_CODE_DOC_ID] = document.id
    field_to_python_values[FIELD_CODE_DOC_NAME] = document.name
    field_to_python_values[FIELD_CODE_DOC_TITLE] = document.title
    field_to_python_values[FIELD_CODE_IS_REVIEWED] = document.is_reviewed()
    field_to_python_values[FIELD_CODE_IS_COMPLETED] = document.is_completed()

    # The stored full text is cut to the configured maximum length; the length field keeps the real length.
    field_to_python_values[FIELD_CODE_DOC_FULL_TEXT] = \
        full_text[:settings.RAW_DB_FULL_TEXT_SEARCH_CUT_ABOVE_TEXT_LENGTH] if full_text else None
    field_to_python_values[FIELD_CODE_DOC_FULL_TEXT_LENGTH] = len(full_text) if full_text else 0

    field_to_python_values[FIELD_CODE_PROJECT_ID] = project.pk if project is not None else None
    field_to_python_values[FIELD_CODE_PROJECT_NAME] = project.name if project is not None else None

    field_to_python_values[FIELD_CODE_ASSIGNEE_ID] = document.assignee_id
    field_to_python_values[FIELD_CODE_ASSIGNEE_NAME] = assignee.get_full_name() if assignee else None

    # last() returns None when the document has no history records at all;
    # in that case there is no create date to report instead of failing with AttributeError.
    history_record = document.history.last()
    field_to_python_values[FIELD_CODE_CREATE_DATE] = \
        history_record.history_date if history_record is not None else None

    field_to_python_values[FIELD_CODE_ASSIGN_DATE] = document.assign_date
    field_to_python_values[FIELD_CODE_STATUS_NAME] = status.name if status else None
Example #3
0
    def test_detect_field_value(self):
        """
        Run the mocked detection strategy on two document texts:
        the first one must give no value, the second one must give a value.
        """
        strategy = CsvRegexpsFieldDetectionStrategyMock
        doc = Document()
        doc.pk = 'A'

        # Texts are spelled out line by line to keep their exact whitespace visible.
        text_without_value = (
            '\n'
            '            Collateral: Enigma Corp\n'
            '            Client ref: "Diane" D.O.O. \n'
            '            '
        )
        text_with_value = (
            '\n'
            '                    Collateral: Family Name  (173437) \n'
            '                    Client ref: "Diane" D.O.O. \n'
            '                    '
        )

        strategy.text_by_doc_id = {doc.pk: text_without_value}
        self.assertIsNone(
            strategy.detect_field_value(logger, doc, doc_field, {}))

        strategy.text_by_doc_id = {doc.pk: text_with_value}
        self.assertIsNotNone(
            strategy.detect_field_value(logger, doc, doc_field, {}))
Example #4
0
    def load_doc(task: ExtendedTask, document: Document, document_fields: Dict, run_detect_field_values: bool,
                 filed_owners: dict = None):
        """
        Save the document together with the provided field values and refresh its cached field values.

        :param task: task used for logging and for loading the field values
        :param document: document to save; inserted if it has no pk yet
        :param document_fields: source data of the field values, passed to load_field_values()
        :param run_detect_field_values: if true, detected values are stored and cached by the detection routine;
            otherwise detection runs without saving and the cache is built afterwards
        :param filed_owners: passed to load_field_values(); an empty dict is used when not provided
        """
        filed_owners = filed_owners or {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, document_fields, filed_owners)
        log = CeleryTaskLogger(task)

        with transaction.atomic():
            is_new = document.pk is None
            document.save(force_insert=is_new)

            if not is_new:
                # Drop the values which are not removed by user and have neither creator nor modifier set.
                stale_values = DocumentFieldValue.objects.filter(document=document,
                                                                 removed_by_user=False,
                                                                 created_by__isnull=True,
                                                                 modified_by__isnull=True)
                stale_values.delete()

            for field, values in fields_to_values.items():
                field_detection.save_detected_values(document, field, values)

            save_detected = bool(run_detect_field_values)
            detected = field_detection.detect_and_cache_field_values_for_document(log, document, save_detected)
            if not save_detected:
                field_value_cache.cache_field_values(document, detected, save=True)

        task.log_info('Loaded {0} field values for document #{1} ({2})'
                      .format(len(fields_to_values), document.pk, document.name))
Example #5
0
    def process(self, **kwargs):
        """
        Apply an annotation status to the annotations with the given uids and reset the status
        of the documents they belong to.

        Expected kwargs: 'ids' (annotation uids), 'status_id', 'user_id'.
        """
        ant_uids = kwargs.get('ids')
        status_id = kwargs.get('status_id')

        # for preventing "connection already closed"
        TaskUtils.prepare_task_execution()
        ann_status = FieldAnnotationStatus.objects.get(pk=status_id)
        user = User.objects.get(pk=kwargs.get('user_id'))

        true_annotations = FieldAnnotation.objects.filter(uid__in=ant_uids)
        false_annotations = FieldAnnotationFalseMatch.objects.filter(uid__in=ant_uids)

        # Imported inside the method as in the original code
        # (presumably to avoid a circular import - TODO confirm).
        from apps.document.repository.document_field_repository import DocumentFieldRepository
        field_repo = DocumentFieldRepository()

        if ann_status.is_rejected:
            for ant in true_annotations:
                field_repo.delete_field_annotation_and_update_field_value(ant, user)
        else:
            field_repo.update_field_annotations_by_ant_ids(
                ant_uids, [(f'{FIELD_CODE_STATUS_ID}', status_id)])
            for false_ant in false_annotations:
                field_repo.restore_field_annotation_and_update_field_value(
                    false_ant, status_id, user)

        # Documents are collected after the changes above, from both annotation tables.
        document_ids = set(
            FieldAnnotation.objects.filter(uid__in=ant_uids).values_list('document_id', flat=True))
        document_ids |= set(
            FieldAnnotationFalseMatch.objects.filter(uid__in=ant_uids).values_list('document_id', flat=True))

        Document.reset_status_from_annotations(ann_status=ann_status,
                                               document_ids=list(document_ids))
def cache_generic_values(doc: Document, save: bool = True,
                         log: ProcessLogger = None):
    """
    Recalculate doc.generic_data and, if requested, save it and notify about the document change.

    :param doc: document to update
    :param save: if true, save the 'generic_data' column and emit a document changed event
    :param log: logger passed to the event
    """
    doc.generic_data = get_generic_values(doc)
    if not save:
        return

    doc.save(update_fields=['generic_data'])

    # Only the generic fields are reported as changed.
    change_event = events.DocumentChangedEvent(log=log,
                                               document=doc,
                                               system_fields_changed=False,
                                               generic_fields_changed=True,
                                               user_fields_changed=False,
                                               pre_detected_field_values=None)
    events.on_document_change(change_event)
Example #7
0
    def make_documents(cls, doc_ids: List[int]):
        """
        Build in-memory documents for the given ids; the i-th document gets the i-th text of TEST_TEXTS.

        :param doc_ids: primary keys to assign to the documents
        :return: list of the created Document objects, in the order of doc_ids
        """
        sample_texts = TEST_TEXTS

        documents: List[Document] = []
        for position, doc_id in enumerate(doc_ids):
            document = Document()
            document.pk = doc_id
            document.project_id = 1
            document.documenttext = DocumentText()
            document.documenttext.full_text = sample_texts[position]
            documents.append(document)
        return documents
Example #8
0
    def pdf_by_document(self, document: Document):
        """
        Generate the output file for the document and return it as a downloadable HTTP response.

        :param document: document providing data, layout, background and variables
        :return: HttpResponse with the generated data sent as an attachment
        """
        # The number of data rows is capped by the current limit of the account.
        limit = document.account.current_limit
        rows = document.load_data().dict[:limit]

        generated_name, _mime_type, payload = self.generate(
            rows,
            document.layout,
            document.background,
            document.get_variables(),
        )

        # Text after the last dot of the generated name (the whole name if it has no dot).
        extension = generated_name.rpartition(".")[2]

        response = HttpResponse(payload, content_type='application/octet-stream')
        response['Content-Disposition'] = 'attachment; filename="{}.{}"'.format(
            document.name, extension)
        return response
def cache_generic_values(doc: Document,
                         save: bool = True,
                         log: ProcessLogger = None,
                         fire_doc_changed_event: bool = True):
    """
    Recalculate doc.generic_data and optionally save it and fire the document changed signal.

    :param doc: document to update
    :param save: if true, save the 'generic_data' column
    :param log: logger passed to the signal
    :param fire_doc_changed_event: if true (and save is true), fire the document changed signal after saving
    """
    doc.generic_data = get_generic_values(doc)

    if not save:
        return
    doc.save(update_fields=['generic_data'])

    if not fire_doc_changed_event:
        return
    # Only the generic fields are reported as changed.
    signals.fire_document_changed(sender=cache_generic_values,
                                  log=log,
                                  document=doc,
                                  system_fields_changed=False,
                                  generic_fields_changed=True,
                                  user_fields_changed=False,
                                  pre_detected_field_values=None)
Example #10
0
    def __init__(self, text: str, field_type: str):
        """
        Build an in-memory document, field, text unit and field detector and wrap the detector into a matcher.

        :param text: text assigned to the text unit
        :param field_type: value assigned to the ``type`` of the field
        """
        self.document = Document()

        field = DocumentField()
        field.type = field_type
        self.field = field

        start = 1001
        text_unit = TextUnit()
        text_unit.document = self.document
        text_unit.textunittext = TextUnitText()
        text_unit.textunittext.text = text
        text_unit.location_start = start
        text_unit.location_end = start + len(text)
        self.text_unit = text_unit

        # One regexp / one definition phrase per line.
        include_regexps = [
            r'at\s{1,5}least\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders',
            r'(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))',
        ]
        definition_words = [
            'required lenders',
            'required revolving lenders',
            'required revolving credit lenders',
            'required term lenders',
            'requisite lenders',
            'requisite revolving lenders',
            'required class lenders',
            'required ddtl lenders',
        ]

        detector = DocumentFieldDetector()
        detector.regexps_pre_process_lower = True
        detector.include_regexps = '\n'.join(include_regexps)
        detector.definition_words = '\n'.join(definition_words)
        detector.detected_value = 'AFFILIATED'
        detector.text_part = TextParts.FULL.value
        detector.extraction_hint = ValueExtractionHint.TAKE_FIRST
        self.detector = detector

        self.matcher = DetectorFieldMatcher(self.detector)
Example #11
0
    def process(self, **kwargs):
        """
        Load a document from the provided field values and/or load documents from the files at 'source_data'.

        Expected kwargs: 'document_name', 'project_id', 'run_detect_field_values',
        'document_fields' (optional), 'source_data' (optional path).

        :raises RuntimeError: if 'source_data' is provided but no files are found at that path
        """
        self.log_info('Going to load document with fields...')

        document_name = kwargs.get('document_name')
        project = Project.objects.get(pk=kwargs.get('project_id'))  # type: Project
        run_detect_field_values = bool(kwargs.get('run_detect_field_values'))

        document_fields = kwargs.get('document_fields') or {}  # type: Dict
        if document_fields:
            document = Document(
                name=document_name,
                project=project,
                document_type=project.type,
                metadata={'parsed_by': None}
            )
            LoadDocumentWithFields.load_doc(self, document, document_fields, run_detect_field_values)

        # An empty path is already treated as "nothing to load" below, so a missing key
        # is handled the same way instead of raising KeyError after the document was loaded.
        path = kwargs.get('source_data')
        if not path:
            return

        self.log_info('Parse {0} at {1}'.format(path, file_access_handler))
        file_list = file_access_handler.list(path)

        # Fail before reporting subtasks which will never be created.
        if not file_list:
            raise RuntimeError('Wrong file or directory name or directory is empty: {}'
                               .format(path))

        self.log_info("Detected {0} files. Added {0} subtasks.".format(len(file_list)))

        load_docs_args = [(file_path, project.id, run_detect_field_values) for file_path in file_list]
        self.run_sub_tasks('Load Each Document',
                           LoadDocumentWithFields.create_document,
                           load_docs_args,
                           file_list)
Example #12
0
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True):
    """
    Run field detection for every field of the document type, in dependency order.

    Detected values are collected and, if allowed, stored via save_detected_values();
    finally the cached field values of the document are rebuilt.
    :param log: logger
    :param document: document to process
    :param save: if true, store the detected values (unless the document status is not active)
        and save the rebuilt cache
    :return: list of all detected values
    """
    persist_cache = save
    persist_detected = save
    if save and document.status and not document.status.is_active:
        # Detected values are not stored for such documents, the cache is still refreshed.
        log.info(
            'Forbidden storing detected field values for document with "completed"'
            ' status, document #{} ({})'.format(document.id, document.name))
        persist_detected = False

    document_type = document.document_type  # type: DocumentType

    dependencies = Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all())
    all_fields = list(document_type.fields.all().prefetch_related(dependencies))

    # Order the field codes so that the detection order respects the declared dependencies.
    sorted_codes = order_field_detection(
        [(f.code, f.get_depends_on_codes() or set()) for f in all_fields])
    code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    field_values_pre_cached = False
    detected_for_document = []

    for field_code in sorted_codes:
        field = code_to_field[field_code]  # type: DocumentField
        strategy = FIELD_DETECTION_STRATEGY_REGISTRY[field.value_detection_strategy]  # type: FieldDetectionStrategy

        if not field_values_pre_cached and strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies
            document.field_values = field_value_cache.cache_field_values(document, None, save=False)
            field_values_pre_cached = True

        detected_values = strategy.detect_field_values(log, document, field)  # type: List[DetectedFieldValue]
        if not detected_values:
            continue

        detected_for_document.extend(detected_values)
        if persist_detected:
            save_detected_values(document, field, detected_values)

    if persist_cache:
        field_value_cache.cache_field_values(document, detected_for_document, save=True, log=log)

    return detected_for_document
Example #13
0
def cache_generic_values(doc: Document, save: bool = True):
    """
    Calculate aggregated values for the document with a single query and put them into doc.generic_data.

    :param doc: document to update
    :param save: if true, save the 'generic_data' column of the document
    """
    # Annotation name -> aggregate; the same names are selected back in the same order.
    aggregates = {
        'cluster_id': Max('documentcluster'),
        'parties': StringAgg('textunit__partyusage__party__name',
                             delimiter=', ',
                             distinct=True),
        'max_currency_amount': Max('textunit__currencyusage__amount'),
        'max_currency_name': Max('textunit__currencyusage__currency'),
        'min_date': Min('textunit__dateusage__date'),
        'max_date': Max('textunit__dateusage__date'),
    }

    doc.generic_data = Document.objects \
        .filter(pk=doc.pk) \
        .annotate(**aggregates) \
        .values(*aggregates) \
        .first()

    if save:
        doc.save(update_fields=['generic_data'])
Example #14
0
    def save_detected_values(document: Document, field: DocumentField,
                             field_type_adapter: FieldType,
                             detected_values: List[DetectedFieldValue],
                             do_not_write: bool):
        """
        Store the detected values of a field and return the number of values accepted.

        For a choice field whose type is not multi-value only one value is accepted: the first choice
        (in the order of field.get_choice_values()) which is present among the detected values.
        For any other field every detected value is accepted.

        Unless detected_values is empty, document.cache_field_values() is called at the end,
        also when storing fails or do_not_write is set.

        :param document: document the values belong to
        :param field: field the values were detected for
        :param field_type_adapter: field type used to store the values
        :param detected_values: values to store
        :param do_not_write: if true, nothing is written, only the count is returned
        :return: number of accepted values (0 if there is nothing to accept)
        """
        if not detected_values:
            return 0

        def _store(dv):
            # Write a single detected value as a non-user value, unless writing is disabled.
            if do_not_write:
                return
            field_type_adapter.save_value(
                document,
                field,
                dv.get_annotation_start(),
                dv.get_annotation_end(),
                dv.get_annotation_text(),
                dv.text_unit,
                dv.value,
                user=None,
                allow_overwriting_user_data=False,
                extraction_hint=dv.hint_name)

        try:
            if field.is_choice_field() and not field_type_adapter.multi_value:
                for choice_value in field.get_choice_values():
                    for dv in detected_values:
                        if choice_value == dv.value:
                            _store(dv)
                            return 1
                # No detected value matches any choice: report zero
                # instead of falling through and returning None.
                return 0

            for dv in detected_values:
                _store(dv)
            return len(detected_values)
        finally:
            document.cache_field_values()
Example #15
0
    def load_doc(task: ExtendedTask,
                 document: Document,
                 field_values_alias_to_value: Dict[str, Any],
                 run_detect_field_values: bool,
                 field_owners: Dict[str, User] = None):
        """
        Save the document with its metadata and the provided field values inside one transaction.

        :param task: task used for logging and for loading the field values
        :param document: document to save; inserted if it has no pk yet
        :param field_values_alias_to_value: source data of the field values, passed to load_field_values()
        :param run_detect_field_values: if true, run field detection and caching after the values are stored;
            otherwise only fire the document changed signal
        :param field_owners: field code -> user to store as the author of the field value
        """
        owners = field_owners or {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
        log = CeleryTaskLogger(task)
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        with transaction.atomic():
            document.save(force_insert=document.pk is None)
            DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

            for field, value_dto in fields_to_values.items():
                field_repo.update_field_value_with_dto(document=document,
                                                       field=field,
                                                       field_value_dto=value_dto,
                                                       user=owners.get(field.code))

            if not run_detect_field_values:
                # No detection requested - just notify that the document was initially loaded.
                signals.fire_document_changed(sender=task,
                                              log=log,
                                              document=document,
                                              changed_by_user=None,
                                              document_initial_load=True,
                                              system_fields_changed=True,
                                              generic_fields_changed=True,
                                              user_fields_changed=True)
            else:
                field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                           document=document,
                                                                           save=True,
                                                                           clear_old_values=False)

        loaded_values = ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())
        task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                      .format(len(fields_to_values),
                              document.pk,
                              document.name,
                              loaded_values))
Example #16
0
 def set_value_from_selection(self, doc: Document, value: str):
     """
     Store the selected text as the document address and fill the geo attributes via Google geocoder.

     :param doc: document to update (not saved here)
     :param value: selected address text
     :return: the address stored in the document
     """
     doc.address = value
     geo = geocoder.google(doc.address)

     if geo.ok:
         latitude, longitude = geo.lat, geo.lng
         country, province = geo.country_long, geo.province_long
     elif geo.status and 'ZERO' in geo.status:
         # Google does not know such address - probably we detected it wrong.
         latitude = longitude = country = province = None
     else:
         # Any other failure: report it and leave the geo attributes untouched.
         print('Unable to detect address via Google geocoder: {0}'.format(geo.status))
         return doc.address

     doc.address_latitude = latitude
     doc.address_longitude = longitude
     doc.address_country = country
     doc.address_state_province = province
     return doc.address
Example #17
0
    def filter_queryset(self, queryset) -> Any:
        """
        Restrict the queryset to processed, not delete-pending documents, then to the documents
        allowed for the user (if 'user_id' is set in extra_kwargs) and to the selected project(s).

        :param queryset: queryset to filter
        :return: filtered queryset
        """
        queryset = queryset.filter(document__processed=True, document__delete_pending=False)

        # perm check - use only allowed docs
        user_id = self.extra_kwargs.get('user_id')
        if user_id:
            queryset = queryset.filter(
                document_id__in=Document.get_allowed_document_ids(user_id))

        project_filter = self.project_id
        if not project_filter:
            return queryset

        # A single id is wrapped into a list, a collection of ids is used as is.
        if isinstance(project_filter, (int, str)):
            project_ids = [project_filter]
        else:
            project_ids = project_filter
        return queryset.filter(document__project_id__in=project_ids)
Example #18
0
    def filter_queryset(self, queryset) -> Any:
        """
        Restrict the queryset to processed, not delete-pending documents, then to the documents
        allowed for the user (if 'user_id' is set in extra_kwargs) and to the selected project(s).

        :param queryset: queryset to filter
        :return: filtered queryset
        """
        # TODO: this is copied from TextUnitFeatures, consider to inherit from 2 parents?
        queryset = queryset.filter(document__processed=True, document__delete_pending=False)

        # perm check - use only allowed docs
        user_id = self.extra_kwargs.get('user_id')
        if user_id:
            queryset = queryset.filter(
                document_id__in=Document.get_allowed_document_ids(user_id))

        project_filter = self.project_id
        if not project_filter:
            return queryset

        # A single id is wrapped into a list, a collection of ids is used as is.
        if isinstance(project_filter, (int, str)):
            project_ids = [project_filter]
        else:
            project_ids = project_filter
        return queryset.filter(document__project_id__in=project_ids)
Example #19
0
    def create_document(task: ExtendedTask, uri: str, project_id, run_detect_field_values):
        """
        Read a JSON file with field values and load it as a new document of the project.

        :param task: task to log into; its title and log extras are updated
        :param uri: location of the JSON file, resolved via file_access_handler
        :param project_id: pk of the project the document is created in
        :param run_detect_field_values: passed through to load_doc()
        """
        with file_access_handler.get_local_fn(uri) as (local_path, file_name):
            task.task.title = 'Load Document: {0}'.format(uri)
            task.log_extra = {'log_document_name': uri}

            with open(local_path, encoding='utf-8') as data_file:
                raw_json = data_file.read()

            # The local copy is still available here, only the file handle is closed.
            field_values = json.loads(raw_json)
            project = Project.objects.get(pk=project_id)
            new_document = Document(
                name=file_name,
                project=project,
                document_type=project.type,
                metadata={'parsed_by': None}
            )
            LoadDocumentWithFields.load_doc(task, new_document, field_values, run_detect_field_values)
Example #20
0
def detect_and_cache_field_values(log: ProcessLogger,
                                  doc: Document,
                                  field: DocumentField,
                                  save: bool = True) -> Optional[List[DetectedFieldValue]]:
    """
    Detect the values of a single field of the document.

    :param log: logger
    :param doc: document to process
    :param field: field to detect the values for
    :param save: if true, store the detected values and rebuild and save the cached field values
    :return: whatever the detection strategy returned for the field
    """
    strategy_code = field.value_detection_strategy
    if strategy_code:
        strategy = FIELD_DETECTION_STRATEGY_REGISTRY[strategy_code]
    else:
        # No strategy configured for the field.
        strategy = STRATEGY_DISABLED

    if strategy.uses_cached_document_field_values(field):
        # Pre-cache Document.field_values structure for the usage in field detection strategies
        doc.field_values = field_value_cache.cache_field_values(doc, None, save=False)

    detected_values = strategy.detect_field_values(log, doc, field)
    if not save:
        return detected_values

    save_detected_values(doc, field, detected_values)
    field_value_cache.cache_field_values(doc, detected_values, save=True, log=log)
    return detected_values
Example #21
0
    def create_document(task: ExtendedTask, uri: str, project_id, run_detect_field_values):
        """
        Read a JSON file with field values from the file storage and load it as a new document of the project.

        :param task: task to log into; its title and log extras are updated
        :param uri: location of the JSON file in the file storage
        :param project_id: pk of the project the document is created in
        :param run_detect_field_values: passed through to load_doc()
        """
        storage = get_file_storage()
        with storage.get_document_as_local_fn(uri) as (local_path, file_name):
            task.task.title = 'Load Document: {0}'.format(uri)
            task.log_extra = {'log_document_name': uri}

            with open(local_path, encoding='utf-8') as data_file:
                raw_json = data_file.read()

            # The local copy is still available here, only the file handle is closed.
            field_values = json.loads(raw_json)
            project = Project.objects.get(pk=project_id)
            new_document = Document(
                name=file_name,
                project=project,
                document_type=project.type,
            )
            LoadDocumentWithFields.load_doc(task=task,
                                            document=new_document,
                                            field_values_alias_to_value=field_values,
                                            run_detect_field_values=run_detect_field_values)
def _fill_system_fields_to_python_values(document: Document,
                                         field_to_python_values: Dict[str,
                                                                      List]):
    """
    Put the values of the system fields of the document into the provided mapping (modified in place).
    Every value is stored as a single-element list.

    :param document: document to read the values from
    :param field_to_python_values: mapping of field code -> list of values to fill
    """
    field_to_python_values.update({
        _FIELD_CODE_DOC_ID: [document.id],
        _FIELD_CODE_DOC_NAME: [document.name],
        _FIELD_CODE_DOC_TITLE: [document.title],
        _FIELD_CODE_IS_REVIEWED: [document.is_reviewed()],
        # The stored full text is cut to the configured maximum length ...
        _FIELD_CODE_DOC_FULL_TEXT: [
            document.full_text[:settings.RAW_DB_FULL_TEXT_SEARCH_CUT_ABOVE_TEXT_LENGTH]
            if document.full_text else None
        ],
        # ... while the length is the length of the uncut text.
        _FIELD_CODE_DOC_FULL_TEXT_LENGTH: [len(document.full_text) if document.full_text else 0],
        _FIELD_CODE_PROJECT_ID: [document.project_id],
        _FIELD_CODE_ASSIGNEE_NAME: [document.assignee.get_full_name() if document.assignee else None],
        _FIELD_CODE_STATUS_NAME: [document.status.name if document.status else None],
    })
Example #23
0
 def import_document(self, values: Dict[str, Any]):
     """
     Create and save a Document from a row of exported values and remember its new pk.

     :param values: column name -> value of the exported document row
     """
     doc = Document()

     # Columns copied to the document attribute of the same name without any conversion.
     copied_as_is = ('name', 'description', 'source', 'source_type', 'paragraphs', 'sentences', 'title',
                     'language', 'file_size', 'folder', 'document_class', 'fields_dirty', 'source_path')
     for column in copied_as_is:
         setattr(doc, column, values[column])

     # References are translated through the lookup tables kept on the importer.
     doc.document_type_id = self.document_types[str(values['document_type_id'])]
     doc.project_id = self.project_ids[values['project_id']]
     doc.status_id = str(values['status_id'])

     # Booleans are compared against the 't' marker.
     doc.delete_pending = values['delete_pending'] == 't'
     doc.processed = values['processed'] == 't'

     if not pd.isnull(values['assign_date']):
         doc.assign_date = values['assign_date']
     if not pd.isnull(values['assignee_id']):
         # Any assigned document is assigned to the target user.
         doc.assignee = self.target_user

     doc.save()

     new_pk = doc.pk
     self.document_ids[values['id']] = new_pk
     self.document_src_paths[new_pk] = doc.source_path
     self.initially_loaded_docs.append(new_pk)
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None,
                       changed_by_user: User = None,
                       system_fields_changed: bool = False,
                       generic_fields_changed: bool = False,
                       document_initial_load: bool = False) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields for the document
    and converts them to the sortable DB-aware form.

    This function does not assign the result to the document itself: when ``save`` is true it fires
    the document changed signal, otherwise it only returns the built structure.
    :param doc: document to build the values for
    :param save: if true, fire the document changed signal
    :param suggested_field_values: detected values used for the "suggested" entries of detectable fields
    :param log: logger passed to the signal
    :param changed_by_user: user passed to the signal
    :param system_fields_changed: passed to the signal
    :param generic_fields_changed: passed to the signal
    :param document_initial_load: passed to the signal
    :return: dict of field uid / suggested field uid -> DB form of the value
    """
    document_type = doc.document_type  # type: DocumentType
    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())

    # Every field of the document type gets an entry, even if no value is stored for it.
    fields_to_field_values = {f: None for f in all_fields}

    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue

        field = fv.field
        field_type = field.get_field_type()  # type: FieldType
        fields_to_field_values[field] = field_type \
            .merge_multi_python_values(fields_to_field_values.get(field), fv.python_value)

    field_uids_to_field_values_db = {}

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        field_uids_to_field_values_db[f.uid] = field_type.merged_python_value_to_db(fields_to_field_values[f])

    if suggested_field_values:
        field_codes_to_suggested_values = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        field_codes_to_suggested_values = None

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        if not f.is_detectable():
            continue

        suggested_field_uid = Document.get_suggested_field_uid(f.uid)
        if field_codes_to_suggested_values:
            suggested_value_db = field_type.merged_python_value_to_db(field_codes_to_suggested_values.get(f.code))
        else:
            # NOTE(review): this reads the dict being built in this very function, which holds no
            # suggested uid yet at this point unless a suggested uid equals a field uid - confirm
            # whether the previously cached suggestion was meant to be reused here.
            suggested_value_db = field_uids_to_field_values_db.get(suggested_field_uid)

        # suggested_value_db can be list, None or int, Iterable validation should be here
        # NOTE(review): a string is Iterable too and would be replaced by its length - confirm
        # that related info fields never get string values here.
        if isinstance(suggested_value_db, Iterable) and f.is_related_info_field():
            suggested_value_db = len(suggested_value_db)
        field_uids_to_field_values_db[suggested_field_uid] = suggested_value_db

    if save:
        signals.fire_document_changed(sender=cache_field_values,
                                      changed_by_user=changed_by_user,
                                      log=log,
                                      document=doc,
                                      system_fields_changed=system_fields_changed,
                                      generic_fields_changed=generic_fields_changed,
                                      user_fields_changed=True,
                                      pre_detected_field_values=field_codes_to_suggested_values,
                                      document_initial_load=document_initial_load)

    return field_uids_to_field_values_db
Example #25
0
    def set_value_from_selection(self, doc: Document, value: str):
        """
        Convert the selected text according to the type of this field and store it
        in the document attribute named by ``self.field``.

        String fields get the text as is. Float and integer fields get the converted text or,
        when the text (including None) cannot be converted, the first number found in it, or None.
        Date fields get the parsed date or, failing that, the first date found in the text, or None.
        For any other field type the attribute is left untouched.

        :param doc: document to update (not saved here)
        :param value: selected text, may be empty or None
        :return: the current value of the document attribute
        """
        def _first_number():
            # Fallback for numeric fields: first number found in the text, None if there is none.
            found = list(extractors.find_numbers(value)) if value else None
            return found[0] if found else None

        field_type = self.field_type

        if field_type == FieldType.FIELD_TYPE_CONCRETE_STRING:
            setattr(doc, self.field, value)

        elif field_type == FieldType.FIELD_TYPE_CONCRETE_FLOAT:
            try:
                setattr(doc, self.field, float(value))
            except (TypeError, ValueError):
                # TypeError is raised for None, which the fallback handles explicitly.
                setattr(doc, self.field, _first_number())

        elif field_type == FieldType.FIELD_TYPE_CONCRETE_INTEGER:
            try:
                setattr(doc, self.field, int(value))
            except (TypeError, ValueError):
                # TypeError is raised for None, which the fallback handles explicitly.
                setattr(doc, self.field, _first_number())

        elif field_type == FieldType.FIELD_TYPE_CONCRETE_DATE:
            parsed = dateparser.parse(value) if value else None
            if parsed:
                setattr(doc, self.field, parsed)
            else:
                dates = list(get_dates(value)) if value else None
                setattr(doc, self.field, dates[0] if dates else None)

        return getattr(doc, self.field)
 def setup_document(self) -> Document:
     """Return a newly constructed Document instance."""
     return Document()
Example #27
0
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[
                           List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None) -> Dict[str, Any]:
    """
    Rebuilds the cached field values of a document from its DocumentFieldValue objects.
    For each field of the document type the python values of its (not removed by user)
    DocumentFieldValue objects are merged and converted to the DB-aware form. For each
    detectable field an additional "suggested" entry is added: it is taken from
    suggested_field_values when they are provided, otherwise the entry already present
    in Document.field_values is carried over.
    When saving, values of related-info fields are replaced by their length in
    Document.field_values; the returned dict always keeps the full values.
    :param doc: document to build the cache for
    :param suggested_field_values: freshly detected values or None / empty to keep the old suggestions
    :param save: if True - assign Document.field_values, save the document and fire the change event
    :param log: logger passed to the document changed event
    :return: dict of field uid (and suggested field uid) -> DB-aware value
    """
    doc_type = doc.document_type  # type: DocumentType
    # TODO: get/save field value for specific field
    fields = list(doc_type.fields.all())

    related_info_uids = {fld.uid for fld in fields if fld.is_related_info_field()}

    # Start with "no value" for every field of the document type.
    merged_by_field = dict.fromkeys(fields)

    for stored in doc.documentfieldvalue_set.all():
        if not stored.removed_by_user:
            fld = stored.field
            stored_type = FIELD_TYPES_REGISTRY[fld.type]  # type: FieldType
            merged_by_field[fld] = stored_type.merge_multi_python_values(
                merged_by_field.get(fld), stored.python_value)

    db_values = {
        fld.uid: FIELD_TYPES_REGISTRY[fld.type].merged_python_value_to_db(merged_by_field[fld])
        for fld in fields
    }

    suggested_by_code = None
    if suggested_field_values:
        suggested_by_code = merge_detected_field_values_to_python_value(
            suggested_field_values)  # type: Dict[str, Any]

    # Suggested entries are appended after all plain field entries.
    for fld in fields:  # type: DocumentField
        fld_type = fld.get_field_type()  # type: FieldType
        if not fld.is_detectable():
            continue

        suggested_uid = Document.get_suggested_field_uid(fld.uid)
        if suggested_by_code:
            suggested_db = fld_type.merged_python_value_to_db(
                suggested_by_code.get(fld.code))
        elif doc.field_values:
            # No new suggestions - keep what is already cached on the document.
            suggested_db = doc.field_values.get(suggested_uid)
        else:
            suggested_db = None

        db_values[suggested_uid] = suggested_db

    if not save:
        return db_values

    cached = {}
    for uid, db_value in db_values.items():
        if uid in related_info_uids and db_value is not None:
            # Related-info fields are cached as the number of values only.
            cached[uid] = len(db_value)
        else:
            cached[uid] = db_value

    doc.field_values = cached
    doc.save()

    change_event = events.DocumentChangedEvent(
        log=log,
        document=doc,
        system_fields_changed=False,
        generic_fields_changed=False,
        user_fields_changed=True,
        pre_detected_field_values=suggested_by_code)
    events.on_document_change(change_event)

    return db_values
 def set_value_from_selection(self, doc: Document, value: str):
     """
     Stores the selected text in the ``address`` attribute of the document.
     :param doc: document whose ``address`` attribute is assigned
     :param value: text to assign
     :return: ``doc.address`` as read back after the assignment
     """
     doc.address = value
     return doc.address
Example #29
0
 def get_value(self, doc: Document):
     """
     Reads the document attribute named by self.field.
     :param doc: document to read the attribute from
     :return: current value of that attribute
     """
     # Use the builtin instead of calling the dunder method directly so that
     # the regular attribute lookup protocol is applied.
     return getattr(doc, self.field)
Example #30
0
def detect_and_cache_field_values_for_document(
        log: ProcessLogger,
        document: Document,
        save: bool = True,
        clear_old_values: bool = True,
        changed_by_user: User = None,
        system_fields_changed: bool = False,
        generic_fields_changed: bool = False,
        document_initial_load: bool = False,
        ignore_field_codes: Set[str] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    Fields are processed in the order returned by order_field_detection() for the
    (field code, depends-on codes) pairs of the document type.
    :param log: logger used for info / error messages and passed to the field value cache
    :param document: document to detect the field values for
    :param save: if False - nothing is stored, detected values are only returned
    :param clear_old_values: if True and detected values are being stored - previously stored values of
        each processed field which were not created / modified / removed by a user are deleted first
    :param changed_by_user: passed through to field_value_cache.cache_field_values()
    :param system_fields_changed: passed through to field_value_cache.cache_field_values()
    :param generic_fields_changed: passed through to field_value_cache.cache_field_values()
    :param document_initial_load: passed through to field_value_cache.cache_field_values()
    :param ignore_field_codes: codes of the fields to skip completely (no detection, no deletion)
    :return: list of all DetectedFieldValue objects detected for the processed fields
    :raises Exception: any error raised by a field detection strategy is logged and re-raised
    """

    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        # Detected values are not stored for documents in a non-active status,
        # but the field value cache is still refreshed below.
        log.info(
            'Forbidden storing detected field values for document with "completed"'
            ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType

    all_fields = list(
        document_type.fields
        .all()
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all())))

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set())
                       for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f
                                for f in all_fields
                                }  # type: Dict[str, DocumentField]

    field_values_pre_cached = False

    res = []
    for field_code in sorted_codes:
        if ignore_field_codes and field_code in ignore_field_codes:
            continue

        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy
        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies.
            # Done at most once, in memory only (save=False).
            document.field_values = field_value_cache.cache_field_values(
                document, None, save=False)
            field_values_pre_cached = True

        try:
            detected_values = field_detection_strategy.detect_field_values(
                log, document, field)  # type: List[DetectedFieldValue]
        except Exception as e:
            msg = '''Unable to detect field value. 
            Document type: {0} 
            Document: {1} 
            Field: {2}'''.format(document_type.code, document.pk, field.code)
            log.error(render_error(msg, e))
            # Bare raise re-raises the active exception with its original traceback.
            raise

        if save_detected and clear_old_values:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            # Only rows without a creating / modifying user and not removed by a user are deleted.
            DocumentFieldValue.objects \
                .filter(document=document,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .delete()

        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(
            document,
            suggested_field_values=res,
            save=True,
            log=log,
            changed_by_user=changed_by_user,
            system_fields_changed=system_fields_changed,
            generic_fields_changed=generic_fields_changed,
            document_initial_load=document_initial_load)

    return res