Code Example #1
def cache_doc_fields_task_impl(_sender, signal, documents, status_name: str,
                               changed_by_user: User):
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func
    repo = RawDbRepository()
    doc_ids = list(documents.values_list('pk', flat=True))
    # write the new status into the rawdb cache for the affected documents
    repo.update_documents_status(doc_ids, status_name)

    # schedule cache_fields_for_docs_queryset for the same queryset as a background task
    call_task_func(cache_fields_for_docs_queryset,
                   (documents, changed_by_user, False, True, True, None),
                   changed_by_user.pk)
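The handler above is a plain callable, so it can be exercised directly from a shell or a test. A minimal sketch, assuming a Document queryset, a status name and a User that exist in your installation (all concrete values below are made up):

doc_qs = Document.objects.filter(project_id=1)
cache_doc_fields_task_impl(None, None, documents=doc_qs, status_name='Completed',
                           changed_by_user=User.objects.get(username='admin'))
# updates the cached status for the selected documents and schedules
# cache_fields_for_docs_queryset as a background task for the same queryset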
Code Example #2
def update_documents_status_impl(sender, signal, documents: QuerySet,
                                 new_status_id: int, changed_by_user: User):
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import plan_reindex_tasks_in_chunks
    repo = RawDbRepository()
    doc_ids = set(documents.values_list('pk', flat=True))
    # write the new status id into the rawdb cache, then plan chunked re-indexing
    # of the 'status' system field for the affected documents
    repo.update_documents_status(doc_ids, new_status_id)
    plan_reindex_tasks_in_chunks(
        doc_ids,
        changed_by_user.pk,
        cache_system_fields=[DocumentSystemField.status.value],
        cache_generic_fields=False,
        cache_user_fields=False)
Code Example #3
def update_documents_assignees_impl(_sender, signal, documents,
                                    assignee_id: int, changed_by_user: User):
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func
    repo = RawDbRepository()
    doc_ids = list(documents.values_list('pk', flat=True))

    # capture each document's previous assignee before it is overwritten
    old_field_values = {
        d.pk: {
            'assignee_id': d.assignee_id,
            'assignee_name': d.assignee.username if d.assignee else ''
        }
        for d in documents
    }
    repo.update_documents_assignees(doc_ids, assignee_id)
    task_ptrs = (documents, changed_by_user, False, True, True, None,
                 old_field_values)
    call_task_func(cache_fields_for_docs_queryset, task_ptrs,
                   changed_by_user.pk)
Code Example #4
    def get_values(self, log: ProcessLogger, field: DocumentField, doc: Document, text: str) \
            -> List[Tuple[Any, Optional[int], Optional[int]]]:

        try:
            conf = getattr(field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
        except DocumentSimilarityConfig.DoesNotExist:
            conf = None

        if conf:
            conf.self_validate()

        similarity_threshold = conf.similarity_threshold if conf else DEFAULT_SIMILARITY_TRESHOLD
        feature_vector_fields = field.depends_on_fields.all()
        date_constraint_field_code = conf.date_constraint_field.code if conf and conf.date_constraint_field else None
        date_constraint_days = conf.date_constraint_days if conf else DEFAULT_DATE_CONSTRAINT_DAYS
        document_type = doc.document_type

        feature_vector_field_codes = {f.code for f in feature_vector_fields}

        # TODO: replace with the corresponding method call when ready
        doc_field_values = dict()
        for fv in doc.documentfieldvalue_set \
                .filter(field__code__in=feature_vector_field_codes.union({date_constraint_field_code})):
            if fv.removed_by_user:
                continue

            # use a local name so the 'field' argument (used further below) is not shadowed
            fv_field = fv.field
            field_type = fv_field.get_field_type()  # type: FieldType
            doc_field_values[fv_field.code] = field_type \
                .merge_multi_python_values(doc_field_values.get(fv_field.code), fv.python_value)
        doc_field_values[FIELD_CODE_DOC_ID] = doc.pk

        doc_date = doc_field_values.get(
            date_constraint_field_code) if date_constraint_field_code else None
        if not doc_date:
            doc_date = doc.history.last().history_date
            date_constraint_field_code = FIELD_CODE_CREATE_DATE

        date_start = doc_date - timedelta(days=date_constraint_days)
        date_end = doc_date + timedelta(days=date_constraint_days)

        try:
            vectorizer = document_feature_vector_pipeline(
                feature_vector_fields, use_field_codes=True)

            rawdb = RawDbRepository()
            where = SQLClause(
                f'"{FIELD_CODE_DOC_ID}" != %s '
                f'and "{date_constraint_field_code}" >= %s '
                f'and "{date_constraint_field_code}" <= %s',
                [doc.pk, date_start, date_end])

            field_values_list = list(
                rawdb.get_field_values(
                    document_type=document_type,
                    where_sql=where,
                    field_codes=feature_vector_field_codes.union(
                        {FIELD_CODE_DOC_ID, date_constraint_field_code})))

            if not field_values_list:
                return []

            field_values_list = [doc_field_values] + field_values_list
            feature_vectors = vectorizer.fit_transform(field_values_list)
            doc_feature_vectors = feature_vectors[0]
        except ValueError as ve:
            if 'empty vocabulary' in str(ve):
                log.info(
                    f'Similarity: {field.code}: Vectorization got "empty vocabulary" - probably none of the docs '
                    f'contain any value in the feature vector fields.')
                return []
            raise ve

        similarities = cosine_similarity(doc_feature_vectors, feature_vectors)

        # TODO: Think about removing the use of field_values_list here and switching to a generator
        # to avoid storing the list of all field values. We only need feature vectors, but they carry no doc id.
        res = list()  # type: List[Tuple[Any, Optional[int], Optional[int]]]
        for y, field_values in enumerate(field_values_list):
            other_doc_pk = field_values[FIELD_CODE_DOC_ID]
            if doc.pk == other_doc_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            res.append((other_doc_pk, None, None))
            self._maybe_save_reverse_similarity_value(
                log=log, field=field, document=doc, other_doc_id=other_doc_pk)

        return res
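Because the query document's own field values are prepended to field_values_list, feature_vectors[0] is the query document, and sklearn's cosine_similarity then returns a single-row matrix, so similarities[0, y] is the similarity between the query document and the document at index y. A minimal standalone sketch of that shape convention (toy vectors, not project data):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

doc_vec = np.array([[1.0, 0.0, 1.0]])        # 1 x n_features: the query document
all_vecs = np.array([[1.0, 0.0, 1.0],        # row 0: the query document itself
                     [0.0, 1.0, 0.0],
                     [1.0, 1.0, 1.0]])
sims = cosine_similarity(doc_vec, all_vecs)  # shape (1, 3)
print(sims[0, 2])                            # similarity to the document at index 2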
Code Example #5
    def process(self, **kwargs):
        dst_field = kwargs['field']
        dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
            .prefetch_related('depends_on_fields') \
            .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
            .first()  # type: DocumentField

        if not dst_field:
            raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

        config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig

        config.self_validate()

        similarity_threshold = config.similarity_threshold
        feature_vector_fields = dst_field.depends_on_fields.all()
        feature_vector_field_codes = {f.code for f in feature_vector_fields}.union({FIELD_CODE_DOC_ID})

        self.log_info('{field}: Min similarity: {threshold}'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        rawdb = RawDbRepository()
        field_values_list = list(rawdb.get_field_values(document_type=dst_field.document_type,
                                                        field_codes=feature_vector_field_codes))

        total_docs = len(field_values_list)

        self.set_push_steps(int(5 + total_docs / 100))

        self.push()
        self.log_info(
            '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        feature_vectors = vectorizer.fit_transform(field_values_list)

        self.push()
        self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        dfvs = list()
        for x, doc_a_field_values in enumerate(field_values_list):
            doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
            similarities = cosine_similarity(feature_vectors[x], feature_vectors)
            for y, doc_b_field_values in enumerate(field_values_list):
                doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
                if doc_a_pk == doc_b_pk:
                    continue
                similarity = similarities[0, y]
                if similarity < similarity_threshold:
                    continue
                dfvs.append(DocumentFieldValue(document_id=doc_a_pk, value=doc_b_pk, field_id=dst_field.pk))
                dfvs.append(DocumentFieldValue(document_id=doc_b_pk, value=doc_a_pk, field_id=dst_field.pk))
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(dfvs)))

        del_doc_batch_size = 100
        for i in range(0, len(field_values_list), del_doc_batch_size):
            DocumentFieldValue.objects \
                .filter(field_id=dst_field.pk) \
                .filter(document_id__in={field_values[FIELD_CODE_DOC_ID] for field_values
                                         in field_values_list[i: i + del_doc_batch_size]}) \
                .delete()
        DocumentFieldValue.objects.bulk_create(dfvs)
        self.push()
class FieldBasedMLOnlyFieldDetectionStrategy(FieldDetectionStrategy):
    code = DocumentField.VD_FIELD_BASED_ML_ONLY
    FIELD_REPOSITORY = RawDbRepository()

    @classmethod
    def uses_cached_document_field_values(cls, field):
        return True

    @classmethod
    def get_user_data(
            cls, field: DocumentField,
            project_ids: Optional[List[str]]) -> Optional[List[dict]]:
        qs_modified_document_ids = field_detection_utils.get_qs_active_modified_document_ids(
            field, project_ids)
        qs_finished_document_ids = field_detection_utils.get_qs_finished_document_ids(
            field.document_type, project_ids)

        return list(
            Document.objects.filter(
                Q(pk__in=Subquery(qs_modified_document_ids))
                | Q(pk__in=Subquery(qs_finished_document_ids))).values_list(
                    'field_values',
                    flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])

    @classmethod
    def init_classifier(cls, field: DocumentField):
        init_script = field.classifier_init_script  # type: str

        return init_classifier_impl(field.code, init_script)

    @classmethod
    def build_pipeline(
        cls, field: DocumentField, depends_on_fields: List[Tuple[str, str,
                                                                 str]]
    ) -> Tuple[Pipeline, List[str]]:

        transformer_list = []
        feature_names_funcs = []
        for field_uid, field_code, field_type in sorted(depends_on_fields,
                                                        key=lambda t: t[1]):
            field_type = FIELD_TYPE_REGISTRY[field_type]  # type: FieldType

            field_vect_steps = [('sel',
                                 FieldValueExtractor(field_uid, field_type))]

            field_vect_pipeline, field_feature_names_func = \
                field_type.build_vectorization_pipeline()

            field_vect_steps.extend(field_vect_pipeline)

            transformer_list.append((field_code, Pipeline(field_vect_steps)))

            feature_names_funcs.append(
                wrap_feature_names_with_field_code(field_feature_names_func,
                                                   field_code))

        classifier = cls.init_classifier(field)

        return Pipeline([('vect', FeatureUnion(transformer_list)),
                         ('clf', classifier)]), feature_names_funcs
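    # A sketch of how build_pipeline is fed (made-up values; the tuples come from
    # get_depends_on_uid_code_type below, and each type string must be a key of
    # FIELD_TYPE_REGISTRY):
    #   deps = [('uid-1', 'contract_type', 'choice'), ('uid-2', 'party', 'string')]
    #   pipeline, feature_names_funcs = cls.build_pipeline(field, deps)
    #   # 'vect' is a FeatureUnion with one sub-pipeline per dependent field and
    #   # 'clf' is the classifier returned by init_classifier(field)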

    @classmethod
    def get_depends_on_uid_code_type(
            cls, field: DocumentField) -> List[Tuple[str, str, str]]:
        return list(field.depends_on_fields.all().values_list(
            'uid', 'code', 'type'))

    @classmethod
    def remove_empty_fields(cls, depends_on_uid_code_type, train_data):
        uids = {r[0] for r in depends_on_uid_code_type}
        for d in train_data:
            non_zero_uids = set()
            for uid in uids:
                if d.get(uid):
                    non_zero_uids.add(uid)
            uids.difference_update(non_zero_uids)
            if len(uids) == 0:
                break

        return [(uid, code, field_type)
                for uid, code, field_type in depends_on_uid_code_type
                if uid not in uids]
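    # A minimal illustration of remove_empty_fields (made-up tuples, not real project data):
    #   depends_on = [('uid-a', 'party', 'choice'), ('uid-b', 'amount', 'money')]
    #   train_data = [{'uid-a': 'Acme'}, {'uid-a': 'Beta', 'uid-b': None}]
    # 'uid-b' never holds a truthy value in train_data, so only the 'party' tuple survives:
    #   remove_empty_fields(depends_on, train_data) == [('uid-a', 'party', 'choice')]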

    @classmethod
    def get_categories(cls, field: DocumentField) -> List[str]:
        return sorted(field.get_choice_values())

    @classmethod
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False,
                                            train_documents: Iterable[Document] = None) \
            -> Optional[ClassifierModel]:

        field_type_adapter = field.get_field_type()  # type: FieldType

        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(field_type_adapter, ChoiceField):
            raise ValueError(
                'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                .format(field.code, field.uid, field_type_adapter.code))
        # Let's find good values of the depends-on fields that are suitable for use as train data.

        train_data = cls.get_train_values(field, train_data_project_ids,
                                          train_documents,
                                          use_only_confirmed_field_values)

        if not train_data:
            raise RuntimeError(
                'Not enough train data for field {0} (#{1}). '
                'Need at least {2} approved or changed documents of type {3}.'.
                format(field.code, field.uid,
                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                       field.document_type.code))

        depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
        depends_on_fields_types = cls.remove_empty_fields(
            depends_on_fields_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(
            field, depends_on_fields_types)  # type: Pipeline, List[Callable]

        categories = cls.get_categories(field)
        category_names_to_indexes = {c: i for i, c in enumerate(categories)}

        log.step_progress()
        log.info(
            'Collecting feature rows from train and test documents in dict form...'
        )

        #  An earlier attempt to use sklearn shuffling went wrong, so the manual approach is kept for a while.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = list()
        train_target_data = list()

        for doc_field_values in train_data:
            field_value = doc_field_values.get(field.uid)
            del doc_field_values[field.uid]

            field_value_idx = category_names_to_indexes.get(
                field_value) if field_value else None

            if field_value_idx is None:
                field_value_idx = len(categories)

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        is_index = math.floor(test_size * len(train_data))

        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        test_is_feature_data = train_feature_data  # [:is_index]
        test_is_target_data = train_target_data  # [:is_index]
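        # Per the TODO above, a possible sklearn equivalent of the manual shuffle/slice
        # (an assumption, not part of the original code):
        #   from sklearn.model_selection import train_test_split
        #   train_feature_data, test_oos_feature_data, train_target_data, test_oos_target_data = \
        #       train_test_split(train_feature_data, train_target_data,
        #                        test_size=test_size, shuffle=True)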

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos, target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(
            test_is_target_data, predicted_is, target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({
            'model': model,
            'categories': categories,
            'feature_names': feature_names
        })
        log.step_progress()
        log.info('Finished.')
        return cm

    @classmethod
    def get_train_values(
            cls, field: DocumentField, train_data_project_ids: Optional[List],
            train_documents: Iterable[Document],
            use_only_confirmed_field_values: bool) -> List[Dict[str, Any]]:
        repo = FieldBasedMLOnlyFieldDetectionStrategy.FIELD_REPOSITORY
        if train_documents:
            return list(
                repo.get_documents_field_values_by_uid(train_documents))

        if train_data_project_ids and not use_only_confirmed_field_values:
            return list(
                repo.get_project_documents_field_values_by_uid(
                    train_data_project_ids,
                    settings.ML_TRAIN_DATA_SET_GROUP_LEN, field.document_type))
        return list(cls.get_user_data(field, train_data_project_ids))

    @classmethod
    def maybe_detect_with_stop_words(
            cls, field: DocumentField,
            cached_fields: Dict[str,
                                Any]) -> Optional[List[DetectedFieldValue]]:
        if field.stop_words:
            depends_on_fields = list(field.depends_on_fields.all())
            depends_on_full_text = []

            if not any(cached_fields):
                return None

            for df in depends_on_fields:  # type: DocumentField
                field_type_adapter = FIELD_TYPE_REGISTRY[
                    df.type]  # type: FieldType
                v = field_type_adapter.merged_db_value_to_python(
                    cached_fields.get(df.uid))
                if v:
                    depends_on_full_text.append(str(v))

            detected_with_stop_words, detected_values = \
                detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
            if detected_with_stop_words:
                return detected_values or list()
        return None

    @classmethod
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        detected_values = cls.maybe_detect_with_stop_words(
            field, cached_fields)
        if detected_values is not None:
            return detected_values

        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
            obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

            model = obj['model']
            categories = obj['categories']

            doc_field_vals = cached_fields
            predicted = model.predict([doc_field_vals])

            target_index = predicted[0]

            target_name = categories[target_index] if 0 <= target_index < len(
                categories) else None

            dfv = DetectedFieldValue(field, target_name)
            return [dfv]

        except ClassifierModel.DoesNotExist as e:
            raise e