def _validate_choice_values_removed(self, context: dict) -> None:
    """Raise ValidationError if the imported config removes choice values, or unsets
    the "allow values not specified in choices" flag, while invalid values exist."""
    saved_field = self._get_saved_field(context)
    if not saved_field or not saved_field.is_choice_field() or not self.object.is_choice_field():
        return
    err_msg = ''
    invalid_choices = self._get_invalid_choices(saved_field)
    if self._is_allow_values_not_specified_in_choices_was_unset(saved_field):
        err_msg += '"Allow values not specified in choices" flag is unset in the config being imported. '
    if invalid_choices:
        invalid_choices = ['"{0}"'.format(invalid_choice) for invalid_choice in invalid_choices]
        err_msg += 'The following choice values are missing in the config being imported: {0}. ' \
            .format(', '.join(invalid_choices))
    if err_msg:
        invalid_values_count = self.object.get_invalid_choice_values().count()
        user_values_count = 0
        detected_values_count = 0
        if invalid_values_count > 0:
            user_values_qs = self.object.get_invalid_choice_values()
            user_values_count = DocumentFieldValue.filter_user_values(user_values_qs).count()
            detected_values_count = self._get_detected_values_count(invalid_values_count, user_values_count)
        err_msg += 'Number of invalid values: user entered values {0}, automatically detected values {1}.' \
                   ' You need to set force auto-fixes option to continue (this option will remove all invalid' \
                   ' values) or make manual updates.'.format(user_values_count, detected_values_count)
        err_msg = 'Unable to update field #{0} "{1}". {2}'.format(self.pk, self.object.code, err_msg)
        raise ValidationError(err_msg)
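# NOTE: a minimal sketch of the _get_detected_values_count() helper called by both
# validators in this section but not shown here. It assumes every counted value is
# either user-entered or automatically detected, so "detected" is simply the
# remainder. The parameter names are illustrative, not taken from the original source.
def _get_detected_values_count(self, total_values_count: int, user_values_count: int) -> int:
    # assumption: values split cleanly into user-entered vs. auto-detected
    return total_values_count - user_values_count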
def _validate_critical_properties_changed(self, context: dict) -> None:
    """Raise ValidationError if the document type or the field type changed,
    since all existing values of this field would become invalid."""
    saved_field = self._get_saved_field(context)
    if not saved_field:
        return
    err_msg = ''
    new_field_type = self.object.type
    old_document_type_pk = self.to_str_if_uuid(self._get_document_type_pk(saved_field))
    new_document_type_pk = self.to_str_if_uuid(self.document_type_pk)
    old_field_type = saved_field.type
    if old_document_type_pk != new_document_type_pk:
        err_msg += 'Document type has changed, old document type id is #{0}, new document type id is #{1}. ' \
            .format(old_document_type_pk, new_document_type_pk)
    if old_field_type != new_field_type:
        err_msg += 'Field type has changed, old field type is "{0}", new field type is "{1}". ' \
            .format(self._get_field_type_title(old_field_type), self._get_field_type_title(new_field_type))
    if err_msg:
        err_msg = 'Unable to update field #{0} "{1}". {2}'.format(self.pk, self.object.code, err_msg)
        values_count = DocumentFieldValue.objects.filter(field=self.object).count()
        user_values_count = 0
        detected_values_count = 0
        if values_count > 0:
            user_values_qs = DocumentFieldValue.objects.filter(field=self.object)
            user_values_count = DocumentFieldValue.filter_user_values(user_values_qs).count()
            detected_values_count = self._get_detected_values_count(values_count, user_values_count)
        err_msg += 'Existing document field values become invalid and will be removed. User entered values {0},' \
                   ' automatically detected values {1}. You need to set force auto-fixes option to continue' \
                   ' (this option will remove all values for this field) or make manual updates.' \
            .format(user_values_count, detected_values_count)
        raise ValidationError(err_msg)
def _maybe_save_reverse_similarity_value(self,
                                         log: ProcessLogger,
                                         field: DocumentField,
                                         document: Document,
                                         other_doc_id) -> bool:
    """Store the reverse similarity link (other_doc -> document) if it does not
    exist yet and refresh the other document's field cache.

    Returns True if a new link was created, False otherwise.
    """
    if DocumentFieldValue.objects.filter(document_id=other_doc_id, field=field, value=document.pk).exists():
        return False
    other_document = Document.all_objects.get(pk=other_doc_id)
    DocumentFieldValue(document=other_document, field=field, value=document.pk).save()
    cache_document_fields(log=log, document=other_document,
                          cache_generic_fields=False, cache_user_fields=True)
    return True
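# NOTE: a hypothetical usage sketch, not part of the original source. It shows the
# intended pairing: after storing a forward similarity link document -> other_doc_id,
# the reverse link is mirrored via _maybe_save_reverse_similarity_value() so the
# relation stays symmetric. The method name _save_similarity_link_both_ways is an
# assumption introduced for illustration.
def _save_similarity_link_both_ways(self, log: ProcessLogger, field: DocumentField,
                                    document: Document, other_doc_id) -> None:
    if not DocumentFieldValue.objects.filter(document=document, field=field,
                                             value=other_doc_id).exists():
        # forward direction: document -> other_doc_id
        DocumentFieldValue(document=document, field=field, value=other_doc_id).save()
        cache_document_fields(log=log, document=document,
                              cache_generic_fields=False, cache_user_fields=True)
    # reverse direction: other_doc_id -> document (no-op if it already exists)
    self._maybe_save_reverse_similarity_value(log, field, document, other_doc_id)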
def process(self, **kwargs):
    """Find pairs of similar documents by cosine similarity of their feature
    vectors and store the links in both directions as document field values."""
    dst_field = kwargs['field']
    dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
        .prefetch_related('depends_on_fields') \
        .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
        .first()  # type: DocumentField
    if not dst_field:
        raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

    config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
    config.self_validate()

    similarity_threshold = config.similarity_threshold
    feature_vector_fields = dst_field.depends_on_fields.all()
    feature_vector_field_codes = {f.code for f in feature_vector_fields}.union({FIELD_CODE_DOC_ID})

    self.log_info('{field}: Min similarity: {threshold}'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    rawdb = RawDbRepository()
    field_values_list = list(rawdb.get_field_values(document_type=dst_field.document_type,
                                                    field_codes=feature_vector_field_codes))
    total_docs = len(field_values_list)
    self.set_push_steps(int(5 + total_docs / 100))
    self.push()

    self.log_info('{field}: Building feature vectors for {n} documents'
                  .format(field=dst_field.code, n=total_docs))
    vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
    feature_vectors = vectorizer.fit_transform(field_values_list)
    self.push()

    self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                  .format(field=dst_field.code, threshold=similarity_threshold))
    dfvs = []
    for x, doc_a_field_values in enumerate(field_values_list):
        doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
        # one row of similarities: document x against every document
        similarities = cosine_similarity(feature_vectors[x], feature_vectors)
        # Start at x + 1 so each unordered pair of documents is examined once;
        # iterating over the full list here would store every link twice.
        for y in range(x + 1, total_docs):
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            doc_b_pk = field_values_list[y][FIELD_CODE_DOC_ID]
            # store the link in both directions so the relation stays symmetric
            dfvs.append(DocumentFieldValue(document_id=doc_a_pk, value=doc_b_pk, field_id=dst_field.pk))
            dfvs.append(DocumentFieldValue(document_id=doc_b_pk, value=doc_a_pk, field_id=dst_field.pk))
        if x % 100 == 0:
            self.log_info('{field}: Checked {x} of {n} documents for similarity'
                          .format(field=dst_field.code, x=x + 1, n=total_docs))
            self.push()
    self.push()

    self.log_info('{field}: Found {n} similarity links. Storing them into the document fields.'
                  .format(field=dst_field.code, n=len(dfvs)))
    # delete previously stored links in batches, then insert the fresh ones
    del_doc_batch_size = 100
    for i in range(0, len(field_values_list), del_doc_batch_size):
        DocumentFieldValue.objects \
            .filter(field_id=dst_field.pk) \
            .filter(document_id__in={field_values[FIELD_CODE_DOC_ID]
                                     for field_values in field_values_list[i: i + del_doc_batch_size]}) \
            .delete()
    DocumentFieldValue.objects.bulk_create(dfvs)
    self.push()
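# NOTE: a self-contained toy sketch of the pairing logic in process() above. All data
# here is made up; only sklearn's cosine_similarity is a real dependency. It shows how
# one row of the similarity matrix is thresholded and how each qualifying pair yields
# two symmetric links while each unordered pair is examined only once.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def demo_similarity_pairs(threshold: float = 0.75) -> list:
    doc_ids = [101, 102, 103]
    feature_vectors = np.array([[1.0, 0.0, 1.0],
                                [0.9, 0.1, 1.0],
                                [0.0, 1.0, 0.0]])
    pairs = []
    for x, doc_a_pk in enumerate(doc_ids):
        # slice keeps the row 2-D, matching cosine_similarity's expected input shape
        similarities = cosine_similarity(feature_vectors[x:x + 1], feature_vectors)
        for y in range(x + 1, len(doc_ids)):
            if similarities[0, y] < threshold:
                continue
            doc_b_pk = doc_ids[y]
            pairs.append((doc_a_pk, doc_b_pk))
            pairs.append((doc_b_pk, doc_a_pk))
    return pairs

# docs 101 and 102 are near-identical, 103 is orthogonal to both, so this returns
# [(101, 102), (102, 101)]
# print(demo_similarity_pairs())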