def cache_doc_fields_task_impl(_sender, signal, documents, status_name: str, changed_by_user: User):
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func

    repo = RawDbRepository()
    doc_ids = list(documents.values_list('pk', flat=True))
    repo.update_documents_status(doc_ids, status_name)
    call_task_func(cache_fields_for_docs_queryset,
                   (documents, changed_by_user, False, True, True, None),
                   changed_by_user.pk)
def update_documents_status_impl(sender, signal, documents: QuerySet, new_status_id: int, changed_by_user: User):
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import plan_reindex_tasks_in_chunks

    repo = RawDbRepository()
    doc_ids = set(documents.values_list('pk', flat=True))
    repo.update_documents_status(doc_ids, new_status_id)
    plan_reindex_tasks_in_chunks(doc_ids,
                                 changed_by_user.pk,
                                 cache_system_fields=[DocumentSystemField.status.value],
                                 cache_generic_fields=False,
                                 cache_user_fields=False)
def update_documents_assignees_impl(_sender, signal, documents, assignee_id: int, changed_by_user: User):
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func

    repo = RawDbRepository()
    doc_ids = list(documents.values_list('pk', flat=True))
    old_field_values = {d.pk: {'assignee_id': d.assignee_id,
                               'assignee_name': d.assignee.username if d.assignee else ''}
                        for d in documents}
    repo.update_documents_assignees(doc_ids, assignee_id)
    task_ptrs = (documents, changed_by_user, False, True, True, None, old_field_values)
    call_task_func(cache_fields_for_docs_queryset, task_ptrs, changed_by_user.pk)
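
# The three *_impl handlers above follow Django's signal-receiver calling convention
# (sender, signal, plus custom keyword arguments). A minimal sketch of how such a handler
# could be wired to a custom signal is shown below; the signal name "documents_assignee_changed",
# the receiver name and the sending code are illustrative assumptions, not the project's actual
# signal definitions.
from django.dispatch import Signal, receiver

documents_assignee_changed = Signal()  # hypothetical signal, for illustration only


@receiver(documents_assignee_changed)
def on_documents_assignee_changed(sender, signal, documents, assignee_id, changed_by_user, **kwargs):
    update_documents_assignees_impl(sender, signal, documents, assignee_id, changed_by_user)

# Somewhere in the code that changes assignees (hypothetical call site):
# documents_assignee_changed.send(sender=Document, documents=docs_queryset,
#                                 assignee_id=new_assignee_id, changed_by_user=request.user)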
def get_values(self, log: ProcessLogger, field: DocumentField, doc: Document, text: str) \
        -> List[Tuple[Any, Optional[int], Optional[int]]]:
    try:
        conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
    except DocumentSimilarityConfig.DoesNotExist:
        conf = None

    if conf:
        conf.self_validate()

    similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
    feature_vector_fields = field.depends_on_fields.all()
    date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
    date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS

    document_type = doc.document_type
    feature_vector_field_codes = {f.code for f in feature_vector_fields}

    # TODO: replace with the corresponding method call when ready
    doc_field_values = dict()
    for fv in doc.documentfieldvalue_set \
            .filter(field__code__in=feature_vector_field_codes.union({date_constraint_field_code})):
        if fv.removed_by_user:
            continue
        # Use a separate name so the destination "field" argument is not shadowed -
        # it is still needed below for logging and for saving reverse similarity values.
        value_field = fv.field
        value_field_type = value_field.get_field_type()  # type: FieldType
        doc_field_values[value_field.code] = value_field_type \
            .merge_multi_python_values(doc_field_values.get(value_field.code), fv.python_value)

    doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

    doc_date = doc_field_values.get(date_constraint_field_code) if date_constraint_field_code else None
    if not doc_date:
        doc_date = doc.history.last().history_date
        date_constraint_field_code = FIELD_CODE_CREATE_DATE

    date_start = doc_date - timedelta(days=date_constraint_days)
    date_end = doc_date + timedelta(days=date_constraint_days)

    try:
        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        rawdb = RawDbRepository()
        where = SQLClause(f'"{FIELD_CODE_DOC_ID}" != %s '
                          f'and "{date_constraint_field_code}" >= %s '
                          f'and "{date_constraint_field_code}" <= %s',
                          [doc.pk, date_start, date_end])
        field_values_list = list(
            rawdb.get_field_values(document_type=document_type,
                                   where_sql=where,
                                   field_codes=feature_vector_field_codes
                                   .union({FIELD_CODE_DOC_ID, date_constraint_field_code})))
        if not field_values_list:
            return []

        field_values_list = [doc_field_values] + field_values_list
        feature_vectors = vectorizer.fit_transform(field_values_list)
        doc_feature_vectors = feature_vectors[0]
    except ValueError as ve:
        if 'empty vocabulary' in str(ve):
            log.info(f'Similarity: {field.code}: Vectorization got "empty vocabulary" - probably none of the docs '
                     f'contains any value in the feature vector fields.')
            return []
        raise ve

    similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

    # TODO: Think about removing usage of other_field_values_list here and switching it to a generator
    # to avoid storing the list of all field values. We only need feature vectors but they have no doc id.
    res = list()  # type: List[Tuple[Any, Optional[int], Optional[int]]]
    for y, field_values in enumerate(field_values_list):
        other_doc_pk = field_values[FIELD_CODE_DOC_ID]
        if doc.pk == other_doc_pk:
            continue
        similarity = similarities[0, y]
        if similarity < similarity_threshold:
            continue
        res.append((other_doc_pk, None, None))
        self._maybe_save_reverse_similarity_value(log=log, field=field, document=doc, other_doc_id=other_doc_pk)

    return res
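
# get_values() above vectorizes per-document field values and keeps only the documents whose
# cosine similarity to the target document's row exceeds the threshold. A minimal, self-contained
# sketch of that pattern with scikit-learn primitives follows; the real
# document_feature_vector_pipeline() is project-specific, so DictVectorizer here is only an
# illustrative stand-in for the feature-vector step, and the helper name is hypothetical.
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity as _cos_sim


def _similar_doc_ids_sketch(target_row: dict, other_rows: list, threshold: float = 0.75) -> list:
    """Return 'document_id' values of rows whose cosine similarity to target_row is >= threshold.

    Each row is a dict of field_code -> value plus a 'document_id' key.
    """
    rows = [target_row] + other_rows
    vectorizer = DictVectorizer()
    # Row 0 is the target document; the remaining rows are the candidates.
    vectors = vectorizer.fit_transform([{k: v for k, v in r.items() if k != 'document_id'} for r in rows])
    similarities = _cos_sim(vectors[0], vectors)  # shape: (1, len(rows))
    return [rows[i]['document_id']
            for i in range(1, len(rows))
            if similarities[0, i] >= threshold]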
def process(self, **kwargs):
    dst_field = kwargs['field']
    dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
        .prefetch_related('depends_on_fields') \
        .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
        .first()  # type: DocumentField

    if not dst_field:
        raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

    config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
    config.self_validate()

    similarity_threshold = config.similarity_threshold
    feature_vector_fields = dst_field.depends_on_fields.all()
    feature_vector_field_codes = {f.code for f in feature_vector_fields}.union({FIELD_CODE_DOC_ID})

    self.log_info('{field}: Min similarity: {threshold}'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    rawdb = RawDbRepository()
    field_values_list = list(rawdb.get_field_values(document_type=dst_field.document_type,
                                                    field_codes=feature_vector_field_codes))
    total_docs = len(field_values_list)

    self.set_push_steps(int(5 + total_docs / 100))

    self.push()
    self.log_info('{field}: Building feature vectors for {n} documents'
                  .format(field=dst_field.code, n=total_docs))

    vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
    feature_vectors = vectorizer.fit_transform(field_values_list)

    self.push()
    self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    dfvs = list()
    for x, doc_a_field_values in enumerate(field_values_list):
        doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
        similarities = cosine_similarity(feature_vectors[x], feature_vectors)
        for y, doc_b_field_values in enumerate(field_values_list):
            doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
            if doc_a_pk == doc_b_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            dfvs.append(DocumentFieldValue(document_id=doc_a_pk, value=doc_b_pk, field_id=dst_field.pk))
            dfvs.append(DocumentFieldValue(document_id=doc_b_pk, value=doc_a_pk, field_id=dst_field.pk))
        if x % 100 == 0:
            self.log_info('{field}: Checked for similarity {x} documents of {n}'
                          .format(field=dst_field.code, x=x + 1, n=total_docs))
            self.push()

    self.push()
    self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                  .format(field=dst_field.code, n=len(dfvs)))

    del_doc_batch_size = 100
    for i in range(0, len(field_values_list), del_doc_batch_size):
        DocumentFieldValue.objects \
            .filter(field_id=dst_field.pk) \
            .filter(document_id__in={field_values[FIELD_CODE_DOC_ID]
                                     for field_values in field_values_list[i: i + del_doc_batch_size]}) \
            .delete()
    DocumentFieldValue.objects.bulk_create(dfvs)
    self.push()
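
# The end of process() refreshes the similarity links by deleting old DocumentFieldValue rows in
# batches of 100 documents and then bulk-creating the new ones. A minimal sketch of that
# replace-in-batches pattern with the standard Django ORM follows; wrapping it in a single
# transaction is an assumption about desired behaviour (readers never observe a half-updated link
# set), not something the task above currently does, and the helper name is hypothetical.
from django.db import transaction


def _replace_field_values_sketch(field_id, doc_ids, new_values, batch_size: int = 100):
    with transaction.atomic():
        # Delete old links in manageable chunks to keep each DELETE statement small.
        for i in range(0, len(doc_ids), batch_size):
            DocumentFieldValue.objects \
                .filter(field_id=field_id, document_id__in=doc_ids[i: i + batch_size]) \
                .delete()
        # Insert the freshly computed links in one bulk statement.
        DocumentFieldValue.objects.bulk_create(new_values)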
class FieldBasedMLOnlyFieldDetectionStrategy(FieldDetectionStrategy):
    code = DocumentField.VD_FIELD_BASED_ML_ONLY

    FIELD_REPOSITORY = RawDbRepository()

    @classmethod
    def uses_cached_document_field_values(cls, field):
        return True

    @classmethod
    def get_user_data(cls,
                      field: DocumentField,
                      project_ids: Optional[List[str]]) -> Optional[List[dict]]:
        # Train candidates: documents with user-modified values plus documents marked as finished.
        qs_modified_document_ids = field_detection_utils.get_qs_active_modified_document_ids(field, project_ids)
        qs_finished_document_ids = field_detection_utils.get_qs_finished_document_ids(field.document_type,
                                                                                      project_ids)
        return list(Document.objects
                    .filter(Q(pk__in=Subquery(qs_modified_document_ids)) |
                            Q(pk__in=Subquery(qs_finished_document_ids)))
                    .values_list('field_values', flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])

    @classmethod
    def init_classifier(cls, field: DocumentField):
        init_script = field.classifier_init_script  # type: str
        return init_classifier_impl(field.code, init_script)

    @classmethod
    def build_pipeline(cls,
                       field: DocumentField,
                       depends_on_fields: List[Tuple[str, str, str]]) -> Tuple[Pipeline, List[str]]:
        transformer_list = []
        feature_names_funcs = []
        for field_uid, field_code, field_type in sorted(depends_on_fields, key=lambda t: t[1]):
            field_type = FIELD_TYPE_REGISTRY[field_type]  # type: FieldType
            field_vect_steps = [('sel', FieldValueExtractor(field_uid, field_type))]

            field_vect_pipeline, field_feature_names_func = field_type.build_vectorization_pipeline()
            field_vect_steps.extend(field_vect_pipeline)

            transformer_list.append((field_code, Pipeline(field_vect_steps)))
            feature_names_funcs.append(wrap_feature_names_with_field_code(field_feature_names_func, field_code))

        classifier = cls.init_classifier(field)

        return Pipeline([('vect', FeatureUnion(transformer_list)),
                         ('clf', classifier)]), feature_names_funcs

    @classmethod
    def get_depends_on_uid_code_type(cls, field: DocumentField) -> List[Tuple[str, str, str]]:
        return list(field.depends_on_fields.all().values_list('uid', 'code', 'type'))

    @classmethod
    def remove_empty_fields(cls, depends_on_uid_code_type, train_data):
        # Drop dependency fields that have no value in any training row.
        uids = {r[0] for r in depends_on_uid_code_type}
        for d in train_data:
            non_zero_uids = set()
            for uid in uids:
                if d.get(uid):
                    non_zero_uids.add(uid)
            uids.difference_update(non_zero_uids)
            if len(uids) == 0:
                break
        return [(uid, code, field_type)
                for uid, code, field_type in depends_on_uid_code_type
                if uid not in uids]

    @classmethod
    def get_categories(cls, field: DocumentField) -> List[str]:
        return sorted(field.get_choice_values())

    @classmethod
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False,
                                            train_documents: Iterable[Document] = None) \
            -> Optional[ClassifierModel]:
        field_type_adapter = field.get_field_type()  # type: FieldType

        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(field_type_adapter, ChoiceField):
            raise ValueError('Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                             .format(field.code, field.uid, field_type_adapter.code))

        # Let's find good values of depends-on fields suitable for using as train data.
        train_data = cls.get_train_values(field,
                                          train_data_project_ids,
                                          train_documents,
                                          use_only_confirmed_field_values)

        if not train_data:
            raise RuntimeError('Not enough train data for field {0} (#{1}). '
                               'Need at least {2} approved or changed documents of type {3}.'
                               .format(field.code, field.uid,
                                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                                       field.document_type.code))

        depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
        depends_on_fields_types = cls.remove_empty_fields(depends_on_fields_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(field, depends_on_fields_types)  # type: Pipeline, List[Callable]

        categories = cls.get_categories(field)
        category_names_to_indexes = {c: i for i, c in enumerate(categories)}

        log.step_progress()
        log.info('Collecting feature rows from train and test documents in dict form...')

        # When we tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
        random.shuffle(train_data)

        # TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = list()
        train_target_data = list()

        for doc_field_values in train_data:
            field_value = doc_field_values.get(field.uid)
            del doc_field_values[field.uid]

            field_value_idx = category_names_to_indexes.get(field_value) if field_value else None
            if field_value_idx is None:
                field_value_idx = len(categories)

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        is_index = math.floor(test_size * len(train_data))

        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        test_is_feature_data = train_feature_data  # [:is_index]
        test_is_target_data = train_target_data  # [:is_index]

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()
        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(test_oos_target_data,
                                                                            predicted_oos,
                                                                            target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(test_is_target_data,
                                                                        predicted_is,
                                                                        target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({'model': model,
                                  'categories': categories,
                                  'feature_names': feature_names})

        log.step_progress()
        log.info('Finished.')
        return cm

    @classmethod
    def get_train_values(cls,
                         field: DocumentField,
                         train_data_project_ids: Optional[List],
                         train_documents: Iterable[Document],
                         use_only_confirmed_field_values: bool) -> List[Dict[str, Any]]:
        repo = FieldBasedMLOnlyFieldDetectionStrategy.FIELD_REPOSITORY
        if train_documents:
            return list(repo.get_documents_field_values_by_uid(train_documents))
        if train_data_project_ids and not use_only_confirmed_field_values:
            return list(repo.get_project_documents_field_values_by_uid(train_data_project_ids,
                                                                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                                                                       field.document_type))
        return list(cls.get_user_data(field, train_data_project_ids))

    @classmethod
    def maybe_detect_with_stop_words(cls,
                                     field: DocumentField,
                                     cached_fields: Dict[str, Any]) -> Optional[List[DetectedFieldValue]]:
        if field.stop_words:
            depends_on_fields = list(field.depends_on_fields.all())
            depends_on_full_text = []

            if not any(cached_fields):
                return None

            for df in depends_on_fields:  # type: DocumentField
                field_type_adapter = FIELD_TYPE_REGISTRY[df.type]  # type: FieldType
                v = field_type_adapter.merged_db_value_to_python(cached_fields.get(df.uid))
                if v:
                    depends_on_full_text.append(str(v))

            detected_with_stop_words, detected_values = \
                detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
            if detected_with_stop_words:
                return detected_values or list()
        return None

    @classmethod
    def detect_field_values(cls,
                            log: ProcessLogger,
                            doc: Document,
                            field: DocumentField,
                            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        detected_values = cls.maybe_detect_with_stop_words(field, cached_fields)
        if detected_values is not None:
            return detected_values

        try:
            classifier_model = ClassifierModel.objects.get(document_field=field)
            obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]
            model = obj['model']
            categories = obj['categories']

            doc_field_vals = cached_fields
            predicted = model.predict([doc_field_vals])

            target_index = predicted[0]
            target_name = categories[target_index] if 0 <= target_index < len(categories) else None

            dfv = DetectedFieldValue(field, target_name)
            return [dfv]
        except ClassifierModel.DoesNotExist as e:
            raise e
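
# detect_field_values() above feeds the cached dependency-field values into the pipeline trained by
# train_document_field_detector_model() and maps the predicted class index back to a choice value.
# A minimal end-to-end sketch of that shape with plain scikit-learn follows; DictVectorizer and
# LogisticRegression stand in for the project's FeatureUnion of per-field pipelines and the
# classifier built from classifier_init_script, and all field codes and values are made up for
# illustration, so this shows the pattern rather than the actual model.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline as SkPipeline

categories = ['lease', 'license', 'service']           # sorted choice values, as in get_categories()
train_rows = [{'dep_field_a': 'landlord', 'dep_field_b': 3.0},
              {'dep_field_a': 'licensor', 'dep_field_b': 1.0},
              {'dep_field_a': 'contractor', 'dep_field_b': 2.0}]
train_targets = [0, 1, 2]                              # indexes into categories; len(categories) means "unknown"

sketch_pipeline = SkPipeline([('vect', DictVectorizer()),
                              ('clf', LogisticRegression())])
sketch_pipeline.fit(train_rows, train_targets)

# Prediction mirrors detect_field_values(): predict an index, then look up the category name.
predicted = sketch_pipeline.predict([{'dep_field_a': 'landlord', 'dep_field_b': 2.5}])
target_index = predicted[0]
target_name = categories[target_index] if 0 <= target_index < len(categories) else None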