Example #1
    def process(self, **kwargs):
        document_field_arg = kwargs.get('document_field')
        document_field_id = document_field_arg['pk'] if document_field_arg else None

        delete_broken = kwargs.get('delete_broken')

        # check FieldValue-s
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        qs_field_values = field_repo.get_field_value_ids_by_doc_field(document_field_id)
        total_num = qs_field_values.count()
        for ids_chunk in chunks(qs_field_values.values_list('pk', flat=True), 100):
            id_list = list(ids_chunk)
            self.run_sub_tasks('Check FieldValues',
                               self.check_field_values,
                               [(id_list, delete_broken)])
            self.log_info(f'Sub-tasks started for {len(id_list)} FieldValues of total {total_num}')

        # check FieldAnnotation-s
        qs_ants = field_repo.get_fieldant_ids_by_doc_field(document_field_id)
        total_num = qs_ants.count()
        for ids_chunk in chunks(qs_ants.values_list('pk', flat=True), 100):
            id_list = list(ids_chunk)
            self.run_sub_tasks('Check FieldAnnotations',
                               self.check_annotations,
                               [(id_list, delete_broken)])
            self.log_info(f'Sub-tasks started for {len(id_list)} FieldAnnotations of total {total_num}')
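Note: Example #1 relies on a chunks helper that batches an iterable into fixed-size lists. A minimal sketch of such a helper, assuming it only needs to yield consecutive batches (the project's actual implementation may differ):

from typing import Any, Iterable, Iterator, List

def chunks(items: Iterable[Any], size: int) -> Iterator[List[Any]]:
    # Yield successive lists of at most `size` items from `items`.
    buf = []  # type: List[Any]
    for item in items:
        buf.append(item)
        if len(buf) == size:
            yield buf
            buf = []
    if buf:
        yield buf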
Example #2
    def process(self, **kwargs):
        ant_uids = kwargs.get('ids')
        status_id = kwargs.get('status_id')

        # for preventing "connection already closed"
        TaskUtils.prepare_task_execution()
        ann_status = FieldAnnotationStatus.objects.get(pk=status_id)
        user = User.objects.get(pk=kwargs.get('user_id'))

        true_annotations = FieldAnnotation.objects.filter(uid__in=ant_uids)
        false_annotations = FieldAnnotationFalseMatch.objects.filter(uid__in=ant_uids)

        if ann_status.is_rejected:
            from apps.document.repository.document_field_repository import DocumentFieldRepository
            field_repo = DocumentFieldRepository()
            for ant in true_annotations:
                field_repo.delete_field_annotation_and_update_field_value(ant, user)
        else:
            import apps.document.repository.document_field_repository as dfr
            field_repo = dfr.DocumentFieldRepository()
            field_repo.update_field_annotations_by_ant_ids(
                ant_uids, [(FIELD_CODE_STATUS_ID, status_id)])

            if false_annotations:
                for false_ant in false_annotations:
                    field_repo.restore_field_annotation_and_update_field_value(
                        false_ant, status_id, user)

        ant_docs = set(FieldAnnotation.objects.filter(
            uid__in=ant_uids).values_list('document_id', flat=True))
        false_ant_docs = set(FieldAnnotationFalseMatch.objects.filter(
            uid__in=ant_uids).values_list('document_id', flat=True))
        ant_docs.update(false_ant_docs)
        Document.reset_status_from_annotations(ann_status=ann_status,
                                               document_ids=list(ant_docs))
Example #3
    def _validate_critical_properties_changed(self, context: dict) -> None:
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        saved_field = self._get_saved_field(context)
        if not saved_field:
            return
        err_msg = ''
        new_field_type = self.object.type
        old_document_type_pk = self.to_str_if_uuid(self._get_document_type_pk(saved_field))
        new_document_type_pk = self.to_str_if_uuid(self.document_type_pk)
        old_field_type = saved_field.type
        if old_document_type_pk != new_document_type_pk:
            err_msg += f'Document type has changed, old document type id is #{old_document_type_pk}' + \
                       f', new document type id is #{self.document_type_pk}. '
        if old_field_type != new_field_type:
            err_msg += f'Field type has changed, old field type is ' + \
                       f'"{self._get_field_type_title(old_field_type)}", ' + \
                       f'new field type is "{self._get_field_type_title(new_field_type)}". '
        if err_msg:
            err_msg = f'Unable to update field #{self.pk} "{self.object.code}". {err_msg}'
            values_count = field_repo.get_count_by_field(self.object.pk)
            user_values_count = 0
            detected_values_count = 0
            if values_count > 0:
                user_values_count = field_repo.get_doc_field_values_filtered_count(self.object.pk)
                detected_values_count = self._get_detected_values_count(values_count, user_values_count)
            err_msg += 'Existing document field values will become invalid and will be removed. ' + \
                       f'User-entered values: {user_values_count}, ' + \
                       f'automatically detected values: {detected_values_count}. ' + \
                       'You need to set the force auto-fixes option to continue ' + \
                       '(this option will remove all values for this field) or make manual updates.'

            raise ValidationError(err_msg)
Example #4
 def check_field_values(task: ExtendedTask,
                        field_value_ids: List[int],
                        delete_broken: bool = False):
     import apps.document.repository.document_field_repository as dfr
     field_repo = dfr.DocumentFieldRepository()
     qs_field_values = field_repo.get_field_values_by_ids(field_value_ids)
     for fv in qs_field_values:  # type: FieldValue
         field = fv.field  # type: DocumentField
         if not TypedField.by(field).is_json_field_value_ok(fv.value):
             FindBrokenDocumentFieldValues.process_broken(task, fv, delete_broken)
Example #5
 def delete_document_history_by_ids(self, ids: List[int]) -> None:
     import apps.document.repository.document_field_repository as dfr
     field_repo = dfr.DocumentFieldRepository()
     field_repo.delete_documents_history_values(ids)
     TextUnitNote.history.filter(text_unit__document_id__in=ids).delete()
     DocumentNote.history.filter(document_id__in=ids).delete()
     Document.history.filter(id__in=ids).delete()
     # delete empty Party and Clusters
     Party.objects.filter(partyusage__isnull=True).delete()
     DocumentCluster.objects.filter(documents__isnull=True).delete()
     TextUnitCluster.objects.filter(text_units__isnull=True).delete()
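A hypothetical call site for the method above; DocumentRepository and project_id are illustrative names, not taken from the example:

# Illustrative usage: purge history for every document in one project
repo = DocumentRepository()  # hypothetical repository class exposing the method above
doc_ids = list(Document.objects.filter(project_id=project_id)
               .values_list('pk', flat=True))
repo.delete_document_history_by_ids(doc_ids)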
Example #6
    def load_doc(task: ExtendedTask,
                 document: Document,
                 field_values_alias_to_value: Dict[str, Any],
                 run_detect_field_values: bool,
                 field_owners: Dict[str, User] = None):
        field_owners = field_owners if field_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
        log = CeleryTaskLogger(task)
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

            for field, value_dto in fields_to_values.items():
                field_repo.update_field_value_with_dto(document=document,
                                                       field=field,
                                                       field_value_dto=value_dto,
                                                       user=field_owners.get(field.code))

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                           document=document,
                                                                           save=True,
                                                                           clear_old_values=False)
            else:
                signals.fire_document_changed(sender=task,
                                              log=log,
                                              document=document,
                                              changed_by_user=None,
                                              document_initial_load=True,
                                              system_fields_changed=True,
                                              generic_fields_changed=True,
                                              user_fields_changed=True)

        task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                      .format(len(fields_to_values),
                              document.pk,
                              document.name,
                              ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))
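A hypothetical invocation of load_doc, assuming it is exposed on the LoadDocumentWithFields task referenced above; the task, document, alias map and owner below are illustrative:

# Illustrative only: load one document with a single pre-filled field value
LoadDocumentWithFields.load_doc(
    task=task,  # a running ExtendedTask instance
    document=doc,  # a new (unsaved) or existing Document
    field_values_alias_to_value={'contract_date': '2021-01-15'},
    run_detect_field_values=False,
    field_owners={'contract_date': owner_user})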
Example #7
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user
    :param system_fields_changed
    :param generic_fields_changed
    :param ignore_field_codes
    :param document_initial_load
    :param updated_field_codes - if set, we search for changed and dependent fields only
    :param skip_modified_values - don't overwrite field values overwritten by user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # We could fetch values only for the fields required by sorted_codes (following
    # their further dependencies), or simply fetch all fields' values (field_codes_only=None).
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only a few requested fields, and trying to
            # comply with the dependency tree makes little sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
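The function above orders detection with order_field_detection over (field code, depends-on codes) pairs so that a field is detected only after its dependencies. A minimal sketch of such an ordering as a Kahn-style topological sort; the project's real helper may differ:

from typing import List, Set, Tuple

def order_field_detection(fields_and_deps: List[Tuple[str, Set[str]]]) -> List[str]:
    # Repeatedly emit the fields whose remaining dependencies are all
    # either already emitted or unknown to this field set.
    remaining = {code: set(deps) for code, deps in fields_and_deps}
    ordered = []  # type: List[str]
    while remaining:
        ready = [code for code, deps in remaining.items()
                 if not (deps & remaining.keys())]
        if not ready:
            raise ValueError(f'Cyclic field dependencies among: {sorted(remaining)}')
        for code in sorted(ready):  # sorted() keeps the order deterministic
            ordered.append(code)
            del remaining[code]
    return ordered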
Example #8
def get_field_values_dump() -> str:
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()
    data = field_repo.get_annotated_values_for_dump()
    transfer_objects = [ExternalFieldValue(**i) for i in data]
    return core_serializers.serialize('json', transfer_objects)
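A hypothetical counterpart for reading such a dump back. It uses Django's standard deserializer, which understands the JSON that core_serializers.serialize('json', ...) produced above:

from django.core import serializers as core_serializers

def iter_field_values_dump(dump_json: str):
    # Hypothetical reader: yields the deserialized model instances one by one.
    for wrapped in core_serializers.deserialize('json', dump_json):
        yield wrapped.object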
Example #9
    def process(self, **kwargs):
        dst_field = kwargs['field']
        dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
            .prefetch_related('depends_on_fields') \
            .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
            .first()  # type: DocumentField

        if not dst_field:
            raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

        proj = kwargs['project']
        proj_id = proj['pk'] if proj else None  # type:Optional[int]
        doc_query = Document.objects.filter(document_type=dst_field.document_type,
                                            project_id=proj_id) if proj_id \
            else Document.objects.filter(document_type=dst_field.document_type)

        config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig

        config.self_validate()

        similarity_threshold = config.similarity_threshold
        feature_vector_fields = list(dst_field.depends_on_fields.all())
        feature_vector_field_codes = [f.code for f in feature_vector_fields]

        self.log_info('{field}: Min similarity: {threshold}'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()
        qr_doc_ids = doc_query.values_list('pk', flat=True)
        doc_ids_to_code_to_value = field_repo \
            .get_field_code_to_python_value_multiple_docs(document_type_id=dst_field.document_type_id,
                                                          doc_ids=qr_doc_ids,
                                                          field_codes_only=feature_vector_field_codes)

        field_values_list = list()
        for doc_id, values in doc_ids_to_code_to_value:
            values[FIELD_CODE_DOC_ID] = doc_id
            field_values_list.append(values)

        total_docs = len(field_values_list)

        self.set_push_steps(int(5 + total_docs / 100))

        self.push()
        self.log_info(
            '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        feature_vectors = vectorizer.fit_transform(field_values_list)

        self.push()
        self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        doc_ids_to_values = defaultdict(set)
        for x, doc_a_field_values in enumerate(field_values_list):
            doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
            similarities = cosine_similarity(feature_vectors[x], feature_vectors)
            for y, doc_b_field_values in enumerate(field_values_list):
                doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
                if doc_a_pk == doc_b_pk:
                    continue
                similarity = similarities[0, y]
                if similarity < similarity_threshold:
                    continue
                doc_ids_to_values[doc_a_pk].add(doc_b_pk)
                doc_ids_to_values[doc_b_pk].add(doc_a_pk)
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(doc_ids_to_values)))

        doc_ids_to_values = {doc_id: list(v) if v else None for doc_id, v in doc_ids_to_values.items()}
        field_repo.store_values_one_field_many_docs_no_ants(field=dst_field, doc_ids_to_values=doc_ids_to_values)

        log = CeleryTaskLogger(self)
        for doc_id in doc_ids_to_values.keys():
            try:
                doc = Document.objects.get(pk=doc_id)
                signals.fire_document_changed(log=log, document=doc, changed_by_user=None, system_fields_changed=False,
                                              generic_fields_changed=False, user_fields_changed=[dst_field.code])
            except Exception as ex:
                self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)
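For reference, a self-contained illustration of the similarity step above, assuming scikit-learn and NumPy; the 0.75 threshold is arbitrary:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vectors = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
sims = cosine_similarity(vectors[0:1], vectors)  # shape (1, 3): doc 0 vs. all docs
similar_to_doc_0 = [i for i, s in enumerate(sims[0]) if i != 0 and s >= 0.75]
print(similar_to_doc_0)  # [1] - only the second vector is close enough to the first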
Example #10
    def process(self,
                document_type: DocumentType = None,
                project_ids: List[int] = None,
                document_name: str = None,
                do_not_run_for_modified_documents=True,
                do_not_write=False,
                **kwargs):
        self.log_info(
            "Going to detect document field values based on "
            "the pre-coded regexps and field values entered by users...")

        if isinstance(document_type, dict):
            document_type = DocumentType.objects.get(pk=document_type['pk'])

        # reindex document grid fields cache after detecting fields
        from apps.rawdb.tasks import auto_reindex_not_tracked
        doc_type_code = document_type.code \
            if document_type and hasattr(document_type, 'code') else None
        call_task_func(auto_reindex_not_tracked, (doc_type_code, ),
                       None,
                       queue=settings.CELERY_QUEUE_SERIAL,
                       run_after_sub_tasks_finished=True,
                       main_task_id=self.request.id)

        document_id = kwargs.get('document_id')
        if document_id:
            self.set_push_steps(1)
            dcptrs = DocDetectFieldValuesParams(document_id, False, True)
            self.run_sub_tasks(
                'Detect Field Values For Single Document',
                DetectFieldValues.detect_field_values_for_document,
                [(dcptrs, )])
            self.push()
            return

        task_count = 0
        document_types = [document_type] if document_type else DocumentType.objects.all()
        document_type_pks = []
        for document_type in document_types:
            if document_type.pk and document_type.fields.exists():
                document_type_pks.append(document_type.pk)
            else:
                self.log_info(
                    'Cannot find any fields assigned to document type: {0}'
                    .format(document_type))

        detect_field_values_for_document_args = []
        source_data = []

        qs = Document.objects.filter(status__is_active=True)
        if document_name:
            qs = qs.filter(name=document_name)
        elif document_id:
            qs = qs.filter(pk=document_id)
        elif project_ids:
            qs = qs.filter(project_id__in=project_ids)
        elif document_type_pks:
            qs = qs.filter(document_type_id__in=document_type_pks)

        # filter out modified documents
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        if do_not_run_for_modified_documents:
            modified_document_ids = field_repo.get_removed_fieldvals_doc_ids()
            qs = qs.exclude(pk__in=Subquery(modified_document_ids))

        for doc_id, source, name in qs.values_list('id', 'source', 'name'):
            dcptrs = DocDetectFieldValuesParams(doc_id, do_not_write, True)
            detect_field_values_for_document_args.append((dcptrs, ))
            if source:
                source_data.append('{0}/{1}'.format(source, name))
            else:
                source_data.append(name)
            task_count += 1

        self.run_sub_tasks('Detect Field Values For Each Document',
                           DetectFieldValues.detect_field_values_for_document,
                           detect_field_values_for_document_args, source_data)
        if task_count > 0:
            self.log_info('Found {0} documents'.format(task_count))
        else:
            self.log_info('No documents found')
Example #11
    def test_field_detector_model(task: ExtendedTask, field_id, document_id) -> dict:
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField
        typed_field = TypedField.by(field)

        expected_field_value_dto = field_detection.detect_field_value(
            CeleryTaskLogger(task), document, field)  # type: FieldValueDTO

        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        if typed_field.requires_value:
            # dates, numbers, etc.
            actual_field_value_dict = field_repo \
                .get_field_code_to_python_value(document_type_id=document.document_type_id,
                                                doc_id=document_id,
                                                field_codes_only={field.code})

            actual_field_value = actual_field_value_dict.get(field.code) if actual_field_value_dict else None
            expected_field_value = expected_field_value_dto.field_value if expected_field_value_dto else None

            matches = bool(expected_field_value == actual_field_value)
        else:
            expected_set = set()
            # related-info fields etc.: compare by annotations (exact comparison)
            if expected_field_value_dto.annotations:
                for ant_dto in expected_field_value_dto.annotations:
                    text_unit_id = field_repo.find_text_unit_id_by_location(
                        document,
                        field,
                        ant_dto.location_in_doc_start,
                        ant_dto.location_in_doc_end)
                    if not text_unit_id:
                        continue
                    expected_set.add('text_unit_' + str(text_unit_id))
            expected_field_value_dto = '; '.join(sorted(expected_set))

            actual_dfvs = FieldAnnotation.objects.filter(
                document_type_id=document.document_type_id,
                doc_id=document_id,
                field_id=field.pk)
            actual_set = {'text_unit_' + str(dfv.text_unit.id) for dfv in actual_dfvs if dfv.text_unit}
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            found_in_text = [dfv.text_unit.text
                             for dfv in expected_field_value_dto
                             if dfv.text_unit and dfv.text_unit.text] if expected_field_value_dto else []
            found_in_text_msg = ''
            if found_in_text:
                found_in_text_msg = '\nDetected in text:\n-----\n{0}\n-----'.format('\n---\n'.join(found_in_text))
            task.log_info('{3} Test doc: {0} (Doc id: {6}, Project: {5}). '
                          'Detected: {1}. Real: {2}.{4}'
                          .format(document.name,
                                  expected_field_value_dto,
                                  actual_field_value,
                                  '[  OK  ]' if matches else '[ ERR  ]',
                                  found_in_text_msg,
                                  document.project.name if document.project else '',
                                  document.pk))

        text_units_number = TextUnit.objects.filter(document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number': text_units_number,
            'value_matches_expected': matches,
            'actual_field_value': actual_field_value if typed_field.is_choice_field else None
        }
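A hypothetical way to consume the returned dict; task, field_id and document_id are assumed to come from the surrounding task context:

# Illustrative only: run the self-test and log a mismatch
result = test_field_detector_model(task, field_id, document_id)
if not result['value_matches_expected']:
    task.log_error(f"Detector mismatch for field {field_id} on document "
                   f"#{document_id} ({result['text_units_number']} text units checked)")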