def process(self, **kwargs):
    """
    Cluster all documents of a project and record the results on the
    ProjectClustering row referenced by kwargs['project_clustering_id'].

    kwargs:
        n_clusters: number of clusters to build (default 3).
        method: clustering algorithm name (default 'kmeans').
        cluster_by: feature source for clustering (default 'term').
        project_clustering_id: pk of the ProjectClustering to update.

    Returns the metadata dict produced by ClusterDocuments.run().
    Side effects: drops previous clusters of the project, updates the
    ProjectClustering status (PENDING -> SUCCESS) and fires a
    "document changed" signal per document to refresh cached cluster ids.
    """
    n_clusters = kwargs.get('n_clusters', 3)
    method = kwargs.get('method', 'kmeans')
    cluster_by = kwargs.get('cluster_by', 'term')

    self.project_clustering_id = kwargs.get('project_clustering_id')
    project_clustering = ProjectClustering.objects.get(pk=self.project_clustering_id)
    # mark the clustering session as started and link it to this task
    project_clustering.status = PENDING
    project_clustering.task = self.task
    project_clustering.save()

    project = project_clustering.project

    self.log_info('Start clustering documents for project id={}'.format(project.id))
    self.log_info('Clustering method: "{}", n_clusters={}'.format(method, n_clusters))
    self.log_info('Cluster by: {}'.format(str(cluster_by)))

    self.set_push_steps(4)
    self.push()

    # clear previous clusters, their tasks and cluster sessions
    # (excluding this task and this clustering session)
    project.drop_clusters(exclude_task_ids={self.request.id},
                          exclude_project_clustering_id=self.project_clustering_id)
    self.push()

    cluster_model = ClusterDocuments(project_id=project.id,
                                     cluster_algorithm=method,
                                     n_clusters=n_clusters,
                                     cluster_by=cluster_by,
                                     use_default_name=True,
                                     log_message=self.log_wo_flooding)
    result = cluster_model.run()

    # persist the clustering result and link the produced cluster objects
    project_clustering.metadata = result.metadata
    project_clustering.save()
    project_clustering.document_clusters.add(*result.metadata['cluster_obj_ids'])
    self.push()

    self.log_info('Clustering completed. Updating document cache.')
    log = CeleryTaskLogger(self)
    # refresh the cached cluster_id generic field on every project document
    for doc in Document.objects.filter(project__pk=project.id):
        signals.fire_document_changed(sender=self,
                                      log=log,
                                      document=doc,
                                      changed_by_user=None,
                                      system_fields_changed=False,
                                      user_fields_changed=False,
                                      generic_fields_changed=[DocumentGenericField.cluster_id.value])

    project_clustering.status = SUCCESS
    project_clustering.save()
    self.push()

    self.log_info('Finished.')
    return result.metadata
def cache_generic_values(doc: Document,
                         save: bool = True,
                         log: ProcessLogger = None,
                         fire_doc_changed_event: bool = True):
    """
    Notify listeners that the generic fields of ``doc`` were re-cached.

    Fires the "document changed" signal with only the generic fields flagged
    as changed. Nothing happens unless both ``save`` and
    ``fire_doc_changed_event`` are truthy.
    """
    if not (save and fire_doc_changed_event):
        return
    signals.fire_document_changed(sender=cache_generic_values,
                                  log=log,
                                  document=doc,
                                  system_fields_changed=False,
                                  generic_fields_changed=True,
                                  user_fields_changed=False,
                                  pre_detected_field_values=None)
def load_doc(task: ExtendedTask,
             document: Document,
             field_values_alias_to_value: Dict[str, Any],
             run_detect_field_values: bool,
             field_owners: Dict[str, User] = None):
    """
    Save a (possibly new) document together with a set of pre-resolved field
    values inside a single DB transaction.

    :param task: the Celery task used for logging and as the signal sender.
    :param document: document to save; inserted if its pk is None.
    :param field_values_alias_to_value: raw alias -> value mapping resolved
           via LoadDocumentWithFields.load_field_values().
    :param run_detect_field_values: when True, run field detection for the
           document after storing the loaded values; otherwise just fire the
           "document changed" signal with all field groups marked changed.
    :param field_owners: optional field code -> owning user mapping used when
           storing each value.
    """
    field_owners = field_owners if field_owners else {}
    fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
    log = CeleryTaskLogger(task)
    # imported locally, presumably to avoid a circular import at module level
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    with transaction.atomic():
        new_document = document.pk is None
        document.save(force_insert=new_document)
        DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})
        for field, value_dto in fields_to_values.items():
            field_repo.update_field_value_with_dto(document=document,
                                                   field=field,
                                                   field_value_dto=value_dto,
                                                   user=field_owners.get(field.code))
        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                       document=document,
                                                                       save=True,
                                                                       clear_old_values=False)
        else:
            signals.fire_document_changed(sender=task,
                                          log=log,
                                          document=document,
                                          changed_by_user=None,
                                          document_initial_load=True,
                                          system_fields_changed=True,
                                          generic_fields_changed=True,
                                          user_fields_changed=True)

    task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                  .format(len(fields_to_values), document.pk, document.name,
                          ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))
def process_document_changed(task: ExtendedTask,
                             doc_id: int,
                             system_fields_changed: FieldSpec = True,
                             generic_fields_changed: FieldSpec = True,
                             user_fields_changed: bool = True,
                             changed_by_user_id: int = None):
    """
    Re-fire the "document changed" signal for a single document, first
    dropping hidden field values when the document status is among the
    changed system fields.

    :param task: Celery task used as signal sender and logger source.
    :param doc_id: pk of the document that changed.
    :param system_fields_changed: spec of changed system fields.
    :param generic_fields_changed: spec of changed generic fields.
    :param user_fields_changed: whether user fields changed.
    :param changed_by_user_id: pk of the acting user, or None for system.
    """
    from apps.document.repository.document_field_repository import DocumentFieldRepository
    field_repo = DocumentFieldRepository()

    doc = Document.objects.get(pk=doc_id)  # type: Document
    if changed_by_user_id is None:
        changed_by_user = None
    else:
        changed_by_user = User.objects.get(pk=changed_by_user_id)

    # a status change may hide some field values - drop them before notifying
    if DocumentSystemField.status.specified_in(system_fields_changed):
        field_repo.delete_hidden_field_values_if_needed(doc, event_sender=task)

    fire_document_changed(sender=task,
                          log=CeleryTaskLogger(task),
                          document=doc,
                          changed_by_user=changed_by_user,
                          document_initial_load=False,
                          system_fields_changed=system_fields_changed,
                          generic_fields_changed=generic_fields_changed,
                          user_fields_changed=user_fields_changed)
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log: logger used for progress and error reporting.
    :param document: the document to detect field values for.
    :param save: when True, persist detected values and fire the "document changed" signal.
    :param clear_old_values: when True, delete old field values (except skipped ones) before detection.
    :param changed_by_user: user to report as the author of the change, or None for system.
    :param system_fields_changed: forwarded to the "document changed" signal.
    :param generic_fields_changed: forwarded to the "document changed" signal.
    :param ignore_field_codes: field codes to exclude from detection.
    :param document_initial_load: forwarded to the "document changed" signal.
    :param updated_field_codes: if set, we search for changed and dependent fields only.
    :param skip_modified_values: don't overwrite field values overwritten by user.
    :return: list of detection results (currently always empty; errors raise FieldDetectionError).
    """
    # imported locally, presumably to avoid a circular import at module level
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    # completed (inactive-status) documents must not be modified
    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))
    all_fields = list(all_fields)

    # (field code, codes it depends on) pairs drive the detection ordering below
    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    # topological-like ordering: dependencies are detected before dependants
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    # narrow the detection set to updated fields + their dependants, minus ignored ones
    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding
    # further dependencies
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:
            # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only few requested fields and trying to comply the dependency
            # tree makes no big sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
def process(self, **kwargs):
    """
    Find pairs of similar documents (cosine similarity over a feature vector
    built from the fields dst_field depends on) and store, for every document,
    the ids of its similar documents into dst_field.

    kwargs:
        field: dict with 'pk' of the destination DocumentField.
        project: dict with 'pk' of the project to restrict to, or a falsy
                 value to process all documents of the field's document type.

    Raises RuntimeError when the destination field does not exist.
    """
    dst_field = kwargs['field']
    dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
        .prefetch_related('depends_on_fields') \
        .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
        .first()  # type: DocumentField

    if not dst_field:
        raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

    proj = kwargs['project']
    proj_id = proj['pk'] if proj else None  # type:Optional[int]

    doc_query = Document.objects.filter(document_type=dst_field.document_type, project_id=proj_id) if proj_id \
        else Document.objects.filter(document_type=dst_field.document_type)

    config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
    config.self_validate()

    similarity_threshold = config.similarity_threshold
    feature_vector_fields = list(dst_field.depends_on_fields.all())
    feature_vector_field_codes = [f.code for f in feature_vector_fields]

    self.log_info('{field}: Min similarity: {threshold}'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    # imported locally, presumably to avoid a circular import at module level
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    qr_doc_ids = doc_query.values_list('pk', flat=True)
    doc_ids_to_code_to_value = field_repo \
        .get_field_code_to_python_value_multiple_docs(document_type_id=dst_field.document_type_id,
                                                      doc_ids=qr_doc_ids,
                                                      field_codes_only=feature_vector_field_codes)

    # flatten into one dict per document, keeping the doc id under FIELD_CODE_DOC_ID
    field_values_list = list()
    for doc_id, values in doc_ids_to_code_to_value:
        values[FIELD_CODE_DOC_ID] = doc_id
        field_values_list.append(values)

    total_docs = len(field_values_list)
    self.set_push_steps(int(5 + total_docs / 100))
    self.push()

    self.log_info(
        '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

    vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
    feature_vectors = vectorizer.fit_transform(field_values_list)
    self.push()

    self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    doc_ids_to_values = defaultdict(set)
    for x, doc_a_field_values in enumerate(field_values_list):
        doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
        similarities = cosine_similarity(feature_vectors[x], feature_vectors)
        for y, doc_b_field_values in enumerate(field_values_list):
            doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
            if doc_a_pk == doc_b_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            # links are stored symmetrically on both documents
            doc_ids_to_values[doc_a_pk].add(doc_b_pk)
            doc_ids_to_values[doc_b_pk].add(doc_a_pk)
        if x % 100 == 0:
            self.log_info('{field}: Checked for similarity {x} documents of {n}'
                          .format(field=dst_field.code, x=x + 1, n=total_docs))
            self.push()
    self.push()

    self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                  .format(field=dst_field.code, n=len(doc_ids_to_values)))

    # BUG FIX: iterating the dict directly yields bare keys (ints) and the
    # 2-tuple unpacking raised TypeError - iterate .items() to get (id, set).
    doc_ids_to_values = {doc_id: list(v) if v else None for doc_id, v in doc_ids_to_values.items()}
    field_repo.store_values_one_field_many_docs_no_ants(field=dst_field, doc_ids_to_values=doc_ids_to_values)

    log = CeleryTaskLogger(self)
    for doc_id in doc_ids_to_values.keys():
        try:
            doc = Document.objects.get(pk=doc_id)
            # pass sender= for consistency with the other fire_document_changed call sites
            signals.fire_document_changed(sender=self,
                                          log=log,
                                          document=doc,
                                          changed_by_user=None,
                                          system_fields_changed=False,
                                          generic_fields_changed=False,
                                          user_fields_changed=[dst_field.code])
        except Exception as ex:
            self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)
def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None,
                       changed_by_user: User = None,
                       system_fields_changed: bool = False,
                       generic_fields_changed: bool = False,
                       document_initial_load: bool = False) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields for the document,
    converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc: document whose cached field values are rebuilt.
    :param save: when True, write the rebuilt cache into doc.field_values and fire the "document changed" signal.
    :param suggested_field_values: optional detected values whose merged python values are cached under
           the per-field "suggested" uids; when absent, existing suggested values from doc.field_values are kept.
    :param log: forwarded to the "document changed" signal.
    :param changed_by_user: forwarded to the "document changed" signal.
    :param system_fields_changed: forwarded to the "document changed" signal.
    :param generic_fields_changed: forwarded to the "document changed" signal.
    :param document_initial_load: forwarded to the "document changed" signal.
    :return: the rebuilt field uid -> DB-aware value mapping.
    """
    document_type = doc.document_type  # type: DocumentType

    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())

    # related-info fields are cached as item counts rather than the values themselves
    related_info_field_uids = {
        f.uid
        for f in all_fields if f.is_related_info_field()
    }

    # merge all non-removed stored values per field into one python value
    fields_to_field_values = {f: None for f in all_fields}
    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue
        field = fv.field
        field_type = FIELD_TYPES_REGISTRY[fv.field.type]  # type: FieldType
        fields_to_field_values[field] = field_type \
            .merge_multi_python_values(fields_to_field_values.get(field), fv.python_value)

    # convert the merged python values to their DB-aware (sortable) form
    field_uids_to_field_values_db = {}
    for f in all_fields:  # type: DocumentField
        field_type = FIELD_TYPES_REGISTRY[f.type]  # type: FieldType
        v = fields_to_field_values[f]
        field_uids_to_field_values_db[
            f.uid] = field_type.merged_python_value_to_db(v)

    if suggested_field_values:
        field_codes_to_suggested_values = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        field_codes_to_suggested_values = None

    # cache suggested (pre-detected) values for detectable fields under their suggested uids
    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        if f.is_detectable():
            suggested_field_uid = Document.get_suggested_field_uid(f.uid)
            if field_codes_to_suggested_values:
                suggested_value_db = field_type.merged_python_value_to_db(
                    field_codes_to_suggested_values.get(f.code))
            else:
                # no fresh suggestions - keep whatever is already cached on the document
                suggested_value_db = doc.field_values.get(
                    suggested_field_uid) if doc.field_values else None
            # suggested_value_db can be list, None or int, Iterable validation should be here
            if isinstance(suggested_value_db, Iterable) and f.is_related_info_field():
                suggested_value_db = len(suggested_value_db)
            field_uids_to_field_values_db[
                suggested_field_uid] = suggested_value_db

    if save:
        # related-info values are cached as their item counts
        doc.field_values = {
            uid: len(value) if uid in related_info_field_uids and value is not None else value
            for uid, value in field_uids_to_field_values_db.items()
        }
        doc.save(update_fields=["field_values"])
        signals.fire_document_changed(
            sender=cache_field_values,
            changed_by_user=changed_by_user,
            log=log,
            document=doc,
            system_fields_changed=system_fields_changed,
            generic_fields_changed=generic_fields_changed,
            user_fields_changed=True,
            pre_detected_field_values=field_codes_to_suggested_values,
            document_initial_load=document_initial_load)

    return field_uids_to_field_values_db