    def process(self, **kwargs):
        n_clusters = kwargs.get('n_clusters', 3)
        method = kwargs.get('method', 'kmeans')
        cluster_by = kwargs.get('cluster_by', 'term')

        self.project_clustering_id = kwargs.get('project_clustering_id')
        project_clustering = ProjectClustering.objects.get(pk=self.project_clustering_id)
        project_clustering.status = PENDING
        project_clustering.task = self.task
        project_clustering.save()

        project = project_clustering.project

        self.log_info('Start clustering documents for project id={}'.format(project.id))
        self.log_info('Clustering method: "{}", n_clusters={}'.format(method, n_clusters))
        self.log_info('Cluster by: {}'.format(str(cluster_by)))

        self.set_push_steps(4)

        self.push()

        # clear previous clusters, their tasks and cluster sessions
        project.drop_clusters(exclude_task_ids={self.request.id},
                              exclude_project_clustering_id=self.project_clustering_id)
        self.push()

        cluster_model = ClusterDocuments(project_id=project.id,
                                         cluster_algorithm=method,
                                         n_clusters=n_clusters,
                                         cluster_by=cluster_by,
                                         use_default_name=True,
                                         log_message=self.log_wo_flooding)
        result = cluster_model.run()

        project_clustering.metadata = result.metadata
        project_clustering.save()
        project_clustering.document_clusters.add(*result.metadata['cluster_obj_ids'])

        self.push()
        self.log_info('Clustering completed. Updating document cache.')

        log = CeleryTaskLogger(self)
        for doc in Document.objects.filter(project__pk=project.id):
            signals.fire_document_changed(sender=self,
                                          log=log,
                                          document=doc,
                                          changed_by_user=None,
                                          system_fields_changed=False,
                                          user_fields_changed=False,
                                          generic_fields_changed=[DocumentGenericField.cluster_id.value])

        project_clustering.status = SUCCESS
        project_clustering.save()

        self.push()
        self.log_info('Finished.')
        return result.metadata
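
# Hedged sketch (not from the source): generic_fields_changed accepts either a
# bool or a list of field codes (a FieldSpec), as the cache-update loop above
# shows, so listeners can refresh only the affected cached columns. The demo_*
# name is hypothetical.
def demo_fire_cluster_id_changed(doc: Document, log: CeleryTaskLogger):
    signals.fire_document_changed(sender=None,
                                  log=log,
                                  document=doc,
                                  changed_by_user=None,
                                  system_fields_changed=False,
                                  user_fields_changed=False,
                                  generic_fields_changed=[DocumentGenericField.cluster_id.value])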
def cache_generic_values(doc: Document, save: bool = True,
                         log: ProcessLogger = None,
                         fire_doc_changed_event: bool = True):
    if save:
        if fire_doc_changed_event:
            signals.fire_document_changed(sender=cache_generic_values,
                                          log=log,
                                          document=doc,
                                          system_fields_changed=False,
                                          generic_fields_changed=True,
                                          user_fields_changed=False,
                                          pre_detected_field_values=None)
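
# Hedged usage sketch: as excerpted above, cache_generic_values only fires the
# change event when both save and fire_doc_changed_event are truthy, so callers
# can batch refreshes and fire a single event themselves. The demo_* name is
# hypothetical.
def demo_refresh_generic_cache(doc: Document, log: ProcessLogger = None):
    cache_generic_values(doc, save=True, log=log, fire_doc_changed_event=True)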
Example #3
    def load_doc(task: ExtendedTask,
                 document: Document,
                 field_values_alias_to_value: Dict[str, Any],
                 run_detect_field_values: bool,
                 field_owners: Dict[str, User] = None):
        field_owners = field_owners if field_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
        log = CeleryTaskLogger(task)
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

            for field, value_dto in fields_to_values.items():
                field_repo.update_field_value_with_dto(document=document,
                                                       field=field,
                                                       field_value_dto=value_dto,
                                                       user=field_owners.get(field.code))

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                           document=document,
                                                                           save=True,
                                                                           clear_old_values=False)
            else:
                signals.fire_document_changed(sender=task,
                                              log=log,
                                              document=document,
                                              changed_by_user=None,
                                              document_initial_load=True,
                                              system_fields_changed=True,
                                              generic_fields_changed=True,
                                              user_fields_changed=True)

        task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                      .format(len(fields_to_values),
                              document.pk,
                              document.name,
                              ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))
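
    # Hedged usage sketch (assuming load_doc is exposed as a static method of
    # LoadDocumentWithFields, as the load_field_values call above suggests; the
    # alias map and the demo_* name are hypothetical): load a new document with
    # preset field values and skip detection, so that a single
    # fire_document_changed marks the initial load.
    @staticmethod
    def demo_load_new_doc(task: ExtendedTask, doc: Document):
        LoadDocumentWithFields.load_doc(task=task,
                                        document=doc,
                                        field_values_alias_to_value={'party_name': 'ACME Inc.'},
                                        run_detect_field_values=False)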
Example #4
def process_document_changed(task: ExtendedTask,
                             doc_id: int,
                             system_fields_changed: FieldSpec = True,
                             generic_fields_changed: FieldSpec = True,
                             user_fields_changed: bool = True,
                             changed_by_user_id: int = None):
    from apps.document.repository.document_field_repository import DocumentFieldRepository

    dfr = DocumentFieldRepository()

    doc = Document.objects.get(pk=doc_id)  # type: Document
    changed_by_user = User.objects.get(pk=changed_by_user_id) if changed_by_user_id is not None else None
    if DocumentSystemField.status.specified_in(system_fields_changed):
        dfr.delete_hidden_field_values_if_needed(doc, event_sender=task)
    fire_document_changed(sender=task,
                          log=CeleryTaskLogger(task),
                          document=doc,
                          changed_by_user=changed_by_user,
                          document_initial_load=False,
                          system_fields_changed=system_fields_changed,
                          generic_fields_changed=generic_fields_changed,
                          user_fields_changed=user_fields_changed)
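
# Hedged usage sketch: refresh caches after a user changes the document status.
# FieldSpec values may be True (all fields) or a list of codes; the list below
# limits the refresh to the status system field. The demo_* name is
# hypothetical, and DocumentSystemField.status.value is assumed to be the
# field's code.
def demo_status_changed(task: ExtendedTask, doc_id: int, user_id: int):
    process_document_changed(task,
                             doc_id=doc_id,
                             system_fields_changed=[DocumentSystemField.status.value],
                             generic_fields_changed=False,
                             user_fields_changed=False,
                             changed_by_user_id=user_id)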
Example #5
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_values.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, detect only the changed fields and their dependents
    :param skip_modified_values: don't overwrite field values already modified by the user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we could fetch only the values required by sorted_codes (following their
    # further dependencies), or simply fetch all field values
    # (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type: List[str]
    detection_errors = []  # type: List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested, do not update current_field_values.
            # Most likely we are detecting only a few requested fields here,
            # and trying to honor the dependency tree makes little sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
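
# Hedged usage sketch: re-detect only the fields that depend on an edited
# field, leaving values the user has already overridden untouched. The field
# code 'effective_date' and the demo_* name are hypothetical.
def demo_redetect_dependents(log: ProcessLogger, doc: Document):
    detect_and_cache_field_values_for_document(log=log,
                                               document=doc,
                                               save=True,
                                               clear_old_values=True,
                                               updated_field_codes=['effective_date'],
                                               skip_modified_values=True)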
Example #6
    def process(self, **kwargs):
        dst_field = kwargs['field']
        dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
            .prefetch_related('depends_on_fields') \
            .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
            .first()  # type: DocumentField

        if not dst_field:
            raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

        proj = kwargs['project']
        proj_id = proj['pk'] if proj else None  # type:Optional[int]
        doc_query = Document.objects.filter(document_type=dst_field.document_type,
                                            project_id=proj_id) if proj_id \
            else Document.objects.filter(document_type=dst_field.document_type)

        config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig

        config.self_validate()

        similarity_threshold = config.similarity_threshold
        feature_vector_fields = list(dst_field.depends_on_fields.all())
        feature_vector_field_codes = [f.code for f in feature_vector_fields]

        self.log_info('{field}: Min similarity: {threshold}'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()
        qr_doc_ids = doc_query.values_list('pk', flat=True)
        doc_ids_to_code_to_value = field_repo \
            .get_field_code_to_python_value_multiple_docs(document_type_id=dst_field.document_type_id,
                                                          doc_ids=qr_doc_ids,
                                                          field_codes_only=feature_vector_field_codes)

        field_values_list = list()
        # assuming the repository returns a {doc_id: {code: value}} mapping
        for doc_id, values in doc_ids_to_code_to_value.items():
            values[FIELD_CODE_DOC_ID] = doc_id
            field_values_list.append(values)

        total_docs = len(field_values_list)

        self.set_push_steps(int(5 + total_docs / 100))

        self.push()
        self.log_info(
            '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        feature_vectors = vectorizer.fit_transform(field_values_list)

        self.push()
        self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        doc_ids_to_values = defaultdict(set)
        for x, doc_a_field_values in enumerate(field_values_list):
            doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
            similarities = cosine_similarity(feature_vectors[x], feature_vectors)
            for y, doc_b_field_values in enumerate(field_values_list):
                doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
                if doc_a_pk == doc_b_pk:
                    continue
                similarity = similarities[0, y]
                if similarity < similarity_threshold:
                    continue
                doc_ids_to_values[doc_a_pk].add(doc_b_pk)
                doc_ids_to_values[doc_b_pk].add(doc_a_pk)
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(doc_ids_to_values)))

        doc_ids_to_values = {doc_id: list(v) if v else None for doc_id, v in doc_ids_to_values.items()}
        field_repo.store_values_one_field_many_docs_no_ants(field=dst_field, doc_ids_to_values=doc_ids_to_values)

        log = CeleryTaskLogger(self)
        for doc_id in doc_ids_to_values.keys():
            try:
                doc = Document.objects.get(pk=doc_id)
                signals.fire_document_changed(log=log, document=doc, changed_by_user=None, system_fields_changed=False,
                                              generic_fields_changed=False, user_fields_changed=[dst_field.code])
            except Exception as ex:
                self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)
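
# Hedged sketch isolating the pairwise-similarity core of the loop above,
# assuming scikit-learn's cosine_similarity and an (n_docs x n_features)
# feature matrix, as produced by the vectorizer. demo_similar_pairs is a
# hypothetical helper, not part of the task.
def demo_similar_pairs(feature_vectors, threshold: float):
    pairs = []
    n = feature_vectors.shape[0]
    for x in range(n):
        # similarities of document x to all documents, shape (1, n)
        sims = cosine_similarity(feature_vectors[x], feature_vectors)
        pairs.extend((x, y) for y in range(n) if y != x and sims[0, y] >= threshold)
    return pairs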
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[
                           List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None,
                       changed_by_user: User = None,
                       system_fields_changed: bool = False,
                       generic_fields_changed: bool = False,
                       document_initial_load: bool = False) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields for the document,
    converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc:
    :param save:
    :param suggested_field_values:
    :param log:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :return:
    """
    document_type = doc.document_type  # type: DocumentType
    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())

    related_info_field_uids = {
        f.uid
        for f in all_fields if f.is_related_info_field()
    }

    fields_to_field_values = {f: None for f in all_fields}

    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue

        field = fv.field
        field_type = FIELD_TYPES_REGISTRY[fv.field.type]  # type: FieldType
        fields_to_field_values[field] = field_type \
            .merge_multi_python_values(fields_to_field_values.get(field), fv.python_value)

    field_uids_to_field_values_db = {}

    for f in all_fields:  # type: DocumentField
        field_type = FIELD_TYPES_REGISTRY[f.type]  # type: FieldType
        v = fields_to_field_values[f]
        field_uids_to_field_values_db[
            f.uid] = field_type.merged_python_value_to_db(v)

    if suggested_field_values:
        field_codes_to_suggested_values = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        field_codes_to_suggested_values = None

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        if f.is_detectable():
            suggested_field_uid = Document.get_suggested_field_uid(f.uid)
            if field_codes_to_suggested_values:
                suggested_value_db = field_type.merged_python_value_to_db(
                    field_codes_to_suggested_values.get(f.code))
            else:
                suggested_value_db = doc.field_values.get(
                    suggested_field_uid) if doc.field_values else None

            # suggested_value_db can be a list, None or an int, hence the Iterable check below
            if isinstance(suggested_value_db,
                          Iterable) and f.is_related_info_field():
                suggested_value_db = len(suggested_value_db)
            field_uids_to_field_values_db[
                suggested_field_uid] = suggested_value_db

    if save:
        doc.field_values = {
            uid: len(value)
            if uid in related_info_field_uids and value is not None else value
            for uid, value in field_uids_to_field_values_db.items()
        }
        doc.save(update_fields=["field_values"])
        signals.fire_document_changed(
            sender=cache_field_values,
            changed_by_user=changed_by_user,
            log=log,
            document=doc,
            system_fields_changed=system_fields_changed,
            generic_fields_changed=generic_fields_changed,
            user_fields_changed=True,
            pre_detected_field_values=field_codes_to_suggested_values,
            document_initial_load=document_initial_load)

    return field_uids_to_field_values_db
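
# Hedged usage sketch: rebuild the Document.field_values cache from stored
# DocumentFieldValue rows, with no suggested values, and fire the change event.
# The demo_* name is hypothetical.
def demo_rebuild_field_values_cache(doc: Document, log: ProcessLogger = None):
    return cache_field_values(doc,
                              suggested_field_values=None,
                              save=True,
                              log=log)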