def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc`` with the trained classifier.

        The classifier model stored for the document's type and the given
        field is loaded, then every text unit of the document whose unit
        type equals ``field.text_unit_type`` is passed to
        ``cls.predict_and_extract_value`` in ``location_start``/``pk`` order.

        :param log: logger used to report a missing classifier model.
        :param doc: document whose text units are scanned.
        :param field: field to detect values for.
        :return: detected values in text order; at most one item unless the
            field type adapter is multi-value or the field is a choice field.
        :raises ClassifierModel.DoesNotExist: when no model is stored for
            this document type / field pair (logged, then re-raised).
        """
        doc_type = doc.document_type  # type: DocumentType
        try:
            model_record = ClassifierModel.objects.get(
                document_type=doc_type, document_field=field)
            trained_model = model_record.get_trained_model_obj()
            adapter = FIELD_TYPES_REGISTRY[field.type]

            found = []  # type: List[DetectedFieldValue]

            units_of_doc = TextUnit.objects.filter(document=doc)
            units_of_type = units_of_doc.filter(
                unit_type=field.text_unit_type)
            ordered_units = units_of_type.order_by('location_start', 'pk')

            for unit in ordered_units.iterator():
                value = cls.predict_and_extract_value(
                    sklearn_model=trained_model,
                    field_type_adapter=adapter,
                    document=doc,
                    field=field,
                    text_unit=unit)
                if value is not None:
                    found.append(value)
                    # Only multi-value / choice fields keep scanning after
                    # the first hit.
                    if not adapter.multi_value \
                            and not field.is_choice_field():
                        break
            return found

        except ClassifierModel.DoesNotExist as missing_model:
            message = 'Classifier model does not exist for field: {0}'
            log.info(message.format(field.code))
            raise missing_model
Exemple #2
0
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool,
                        csv_contains_regexps: bool = False):
    """Create field detectors for ``document_field`` from a simple CSV config.

    Each CSV row is read as: first column - the value to detect, remaining
    columns - search strings (or regexps) that indicate this value. One
    ``DocumentFieldDetector`` of category
    ``FD_CATEGORY_IMPORTED_SIMPLE_CONFIG`` is saved per usable row.

    :param log: logger used for messages and progress reporting.
    :param document_field: field the detectors are created for.
    :param csv: raw CSV file content.
    :param drop_previous_field_detectors: when true, detectors of this field
        previously imported with the same category are deleted first.
    :param update_field_choice_values: when true, the field's ``choices`` are
        replaced with the sorted unique values of the first CSV column.
    :param csv_contains_regexps: when false, search strings are stripped and
        every space is replaced with ``\\s{1,100}``; when true they are used
        as they are.
        NOTE(review): in plain-text mode other regexp metacharacters are not
        escaped - confirm whether that is intended.
    :raises ValueError: if the CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        first_column = df[df.columns[0]]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    log.set_progress_steps_number(int(row_num / 10) + 1)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    for index, row in df.iterrows():
        # Positional access: the first column is the detected value whatever
        # the CSV header is called.
        detected_value = row.iloc[0]

        # Search strings are taken from the remaining columns only, so a
        # missing/blank first cell can no longer shift them by one.
        includes = row.iloc[1:].dropna()
        if not csv_contains_regexps:
            includes = [i.strip().replace(' ', r'\s{1,100}') for i in includes]
        includes = [i for i in includes if i]

        if pd.isna(detected_value) or not detected_value.strip():
            log.info(
                'There is no detected value specified in row {0}'.format(index))
        elif not includes:
            log.info(
                'There are no search strings specified for detected value {0}'.
                format(detected_value))
        else:
            detector = DocumentFieldDetector()
            detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
            detector.field = document_field
            detector.regexps_pre_process_lower = True
            detector.detected_value = detected_value
            detector.include_regexps = '\n'.join(includes)
            detector.save()

        # Progress is advanced for skipped rows as well.
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
def apply_simple_config(log: ProcessLogger, document_field: DocumentField,
                        document_type: DocumentType, csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create field detectors for a field / document type from a CSV config.

    Every CSV row produces one ``DocumentFieldDetector``: the first column is
    stored as the detected value and all non-empty cells of the row
    (including the first one), lower-cased and joined with newlines, are
    stored as the include regexps.

    :param log: logger used for messages and progress reporting.
    :param document_field: field the detectors are created for.
    :param document_type: document type the detectors are bound to.
    :param csv: raw CSV file content.
    :param drop_previous_field_detectors: when true, all existing detectors
        of this field and document type are deleted first.
    :param update_field_choice_values: when true, the field's ``choices`` are
        replaced with the sorted unique values of the first CSV column.
    :raises ValueError: if the CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        first_column = df[df.columns[0]]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_type, row_num))
    log.set_progress_steps_number(int(row_num / 10) + 1)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field, document_type=document_type).delete()

    for index, row in df.iterrows():
        # Positional access: the first column is the detected value whatever
        # the CSV header is called.
        detected_value = row.iloc[0]
        if pd.isna(detected_value):
            # Without this guard a NaN float would be stored as the value.
            log.info(
                'There is no detected value specified in row {0}'.format(index))
        else:
            detector = DocumentFieldDetector()
            detector.document_type = document_type
            detector.field = document_field
            detector.regexps_pre_process_lower = True
            detector.detected_value = detected_value
            detector.include_regexps = '\n'.join(row.dropna()).lower()
            detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
Exemple #4
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc``: stop words first, then ML.

        ``detect_with_stop_words_by_field_and_full_text`` is tried on the
        document's full text; when it reports a detection its values (or an
        empty list) are returned straight away. Otherwise the classifier
        model stored for the field is applied to every text unit of the
        document whose unit type equals ``field.text_unit_type``, in
        ``location_start``/``pk`` order.

        :param log: logger used to report a missing classifier model.
        :param doc: document to inspect.
        :param field: field to detect values for.
        :param cached_fields: not used by this implementation.
        :return: detected values; at most one item on the classifier path
            unless the field type is multi-value or the field is a choice
            field.
        :raises ClassifierModel.DoesNotExist: when no model is stored for the
            field (logged, then re-raised).
        """
        full_text = doc.full_text
        stop_words_hit, stop_words_values = \
            detect_with_stop_words_by_field_and_full_text(field, full_text)
        if stop_words_hit:
            return stop_words_values if stop_words_values else list()

        try:
            model_record = ClassifierModel.objects.get(document_field=field)
            trained_model = model_record.get_trained_model_obj()
            adapter = field.get_field_type()

            found = []  # type: List[DetectedFieldValue]

            units_of_doc = TextUnit.objects.filter(document=doc)
            units_of_type = units_of_doc.filter(
                unit_type=field.text_unit_type)
            ordered_units = units_of_type.order_by('location_start', 'pk')

            for unit in ordered_units.iterator():
                value = cls.predict_and_extract_value(
                    sklearn_model=trained_model,
                    field_type_adapter=adapter,
                    document=doc,
                    field=field,
                    text_unit=unit)
                if value is not None:
                    found.append(value)
                    # Only multi-value / choice fields keep scanning after
                    # the first hit.
                    if not adapter.multi_value \
                            and not field.is_choice_field():
                        break
            return found

        except ClassifierModel.DoesNotExist as missing_model:
            message = 'Classifier model does not exist for field: {0}'
            log.info(message.format(field.code))
            raise missing_model
Exemple #5
0
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            document_type: DocumentType,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False) -> Optional[ClassifierModel]:
        """Train a classifier model for a document type / field pair.

        Train data is either every non-removed ``DocumentFieldValue`` of the
        field in the given projects (when project ids are passed and
        ``use_only_confirmed_field_values`` is false) or whatever
        ``get_train_data_sets`` returns.

        :param log: logger used for progress messages.
        :param document_type: document type the model is trained for.
        :param field: field the model is trained for.
        :param train_data_project_ids: ids of projects to take train data
            from; may be empty or ``None``.
        :param use_only_confirmed_field_values: when true, always delegate
            train data selection to ``get_train_data_sets``.
        :return: result of ``train_model`` or ``None`` when there is no train
            data.
        """
        log.info('Training model for field #{0} ({1})...'
                 .format(field.pk, field.code))

        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data = list(
                DocumentFieldValue.objects
                .filter(field_id=field.pk,
                        document__project_id__in=train_data_project_ids,
                        document__document_type_id=document_type.pk,
                        removed_by_user=False)
                .values('created_by', 'text_unit__text', 'value', 'extraction_hint'))
            # An empty query result must give an empty list of sets: ``[[]]``
            # is truthy and would slip past the "not enough data" check below.
            train_data_sets = [train_data] if train_data else []
        else:
            train_data_sets = get_train_data_sets(document_type, field, train_data_project_ids)

        if not train_data_sets:
            log.info('Not enough data to train model for document_type #{0} and field #{1}.'
                     .format(document_type.pk, field.pk))
            return None

        classifier_model = train_model(document_type, field, train_data_sets)
        log.info(
            'Finished training model for document_type #{0} and field #{1}.'.format(document_type.pk, field.pk))

        return classifier_model
    def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
    ) -> Optional[ClassifierModel]:
        """Train a model predicting a choice field from other field values.

        Each train sample is the ``field_values`` dict of one document: the
        entry of ``field`` itself is the target, the remaining entries are
        the features. The shuffled samples are split 20/80 into an
        out-of-sample test set and a train set; a slice of the train set of
        the same size is used for the in-sample report.

        :param log: logger used for messages and progress reporting.
        :param field: choice field to train the model for.
        :param train_data_project_ids: ids of projects to take train
            documents from; may be empty or ``None``.
        :param use_only_confirmed_field_values: when true (or when no project
            ids are given) train data comes from ``cls.get_user_data``.
        :return: a new ``ClassifierModel`` holding the fitted pipeline, the
            category list, the feature names and both accuracy reports; this
            method does not call ``save()`` on it.
        :raises ValueError: if the field type is not ``ChoiceField``.
        :raises RuntimeError: if no train data was found.
        """
        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(field_type_adapter, ChoiceField):
            raise ValueError(
                'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                .format(field.code, field.uid, field_type_adapter.code))

        # Find values of depends-on fields suitable for use as train data.
        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data = list(
                Document.objects
                .filter(project_id__in=train_data_project_ids)
                .values_list('field_values', flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        else:
            train_data = list(cls.get_user_data(field, train_data_project_ids))

        if not train_data:
            raise RuntimeError(
                'Not enough train data for field {0} (#{1}). '
                'Need at least {2} approved or changed documents of type {3}.'.
                format(field.code, field.uid,
                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                       field.document_type.code))

        depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
        depends_on_fields_types = cls.remove_empty_fields(
            depends_on_fields_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(
            field, depends_on_fields_types)  # type: Pipeline, List[Callable]

        categories = sorted([c.strip() for c in field.choices.split('\n')])
        category_names_to_indexes = {c: i for i, c in enumerate(categories)}
        # Indexes of the known categories; index len(categories) is reserved
        # below for "missing or unknown value".
        known_labels = list(range(len(categories)))
        unknown_label = len(categories)

        log.step_progress()
        log.info(
            'Collecting feature rows from train and test documents in dict form...'
        )

        #  When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = list()
        train_target_data = list()

        for doc_field_values in train_data:
            # pop() with a default: a document may have no value stored for
            # this field at all, and the target must not leak into features.
            field_value = doc_field_values.pop(field.uid, None)

            field_value_idx = category_names_to_indexes.get(
                field_value) if field_value else None
            if field_value_idx is None:
                field_value_idx = unknown_label

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        is_index = math.floor(test_size * len(train_data))

        # First 20% of the shuffled samples - out-of-sample test set.
        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        # The rest is used for training.
        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        # Same-sized slice of the train set - in-sample test set.
        test_is_feature_data = train_feature_data[:is_index]
        test_is_target_data = train_target_data[:is_index]

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        # ``labels`` is passed explicitly so that report rows always match
        # ``categories`` one to one, even when some category is absent from
        # the test data or the extra "unknown" index is present.
        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos,
            labels=known_labels, target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(
            test_is_target_data, predicted_is,
            labels=known_labels, target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({
            'model': model,
            'categories': categories,
            'feature_names': feature_names
        })
        log.step_progress()
        log.info('Finished.')
        return cm