def test_order_field_detection(self) -> None:
        """Run regexp-only detection over two in-memory text units.

        The strategy's class-level repositories are temporarily replaced with
        mock repositories and restored afterwards, even if detection raises.
        Exactly one detected item is expected.
        """
        init_field_type_registry()
        doc = self.setup_document()

        field = DocumentField()
        field.requires_text_annotations = False
        field.stop_words = None
        field.text_unit_type = 'sentences'

        # Two text units, each bound to the test document and the field's unit type.
        sentences = ("But those cushion's velvet lining",
                     "She shall press! Ah! Nevermore...")
        text_unit_repo = MockTextUnitRepository()
        text_unit_repo.units = [TextUnit(), TextUnit()]
        for unit, sentence in zip(text_unit_repo.units, sentences):
            unit.text = sentence
            unit.document = doc
            unit.unit_type = field.text_unit_type

        detect_repo = MockFieldDetectorRepository()
        detect_repo.detectors = [self.make_doc_field_detector()]

        # Swap the class-level repositories for the mocks; remember the originals.
        strategy = RegexpsOnlyFieldDetectionStrategy
        original_tu_repo = strategy.text_unit_repo
        original_detector_repo = strategy.field_detector_repo
        strategy.text_unit_repo = text_unit_repo
        strategy.field_detector_repo = detect_repo

        try:
            detected = strategy.detect_field_value(None, doc, field)
        finally:
            strategy.text_unit_repo = original_tu_repo
            strategy.field_detector_repo = original_detector_repo

        self.assertEqual(1, len(detected))
Ejemplo n.º 2
0
def apply_simple_config(log: ProcessLogger, document_field: DocumentField,
                        csv: bytes, drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create one field detector per row of a "simple config" CSV.

    For every CSV row a ``DocumentFieldDetector`` is saved whose detected
    value is the first cell of the row and whose include-regexps are all
    non-empty cells of the row, lower-cased and joined with newlines.

    :param log: logger used for messages and progress reporting.
    :param document_field: field the detectors are created for.
    :param csv: raw CSV content; all cells are read as strings.
    :param drop_previous_field_detectors: if true, detectors of this field that
        were previously imported from a simple config are deleted first.
    :param update_field_choice_values: if true, the field's ``choices`` are
        replaced with the sorted unique non-empty values of the first column.
    :raises ValueError: if the CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        choices = df[
            df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    # One progress step is reported for rows 0, 10, 20, ... so the total is
    # ceil(row_num / 10); the former "int(row_num / 10) + 1" over-counted by
    # one whenever row_num was a multiple of 10.
    log.set_progress_steps_number((row_num + 9) // 10)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    # Count rows by position so progress does not depend on index labels.
    for position, (_, row) in enumerate(df.iterrows()):
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # The row is labelled by CSV header names, so the first cell must be
        # fetched positionally; integer keys via row[0] are deprecated there.
        # NOTE(review): an empty first cell presumably yields NaN here - confirm
        # whether such rows should be skipped.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if position % 10 == 0:
            log.step_progress()
    log.info('Done.')
Ejemplo n.º 3
0
    def detect_field_values_for_python_coded_field(
            document: Document, field: DocumentField,
            sentence_text_units: List[TextUnit], do_not_write: bool) -> int:
        """Detect values of a python-coded field and hand them over for saving.

        The python-coded implementation is looked up by
        ``field.python_coded_field``. Depending on its ``by_sentence`` flag it
        is run either on each given text unit or once on the document's full
        text. For fields that are neither multi-value nor choice fields,
        detection stops at the first value.

        :return: result of ``DetectFieldValues.save_detected_values``.
        :raises RuntimeError: if the python-coded field is unknown, or a value
            found in the full text does not start inside any sentence text unit.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        found = list()  # type: List[DetectedFieldValue]

        def persist():
            # Single exit point for every "we are done detecting" path.
            return DetectFieldValues.save_detected_values(
                document, field, field_type_adapter, found, do_not_write)

        if coded_field.by_sentence:
            for unit in sentence_text_units:
                for value, start, end in coded_field.get_values(
                        unit.text) or []:
                    found.append(
                        DetectedFieldValue(unit, value, None, start, end))
                    if field_type_adapter.multi_value \
                            or field.is_choice_field():
                        continue
                    return persist()
            return persist()

        for value, abs_start, abs_end in coded_field.get_values(
                document.full_text) or []:
            # Locate the sentence that contains the start of the match.
            unit = TextUnit.objects.filter(
                document=document,
                unit_type='sentence',
                location_start__lte=abs_start,
                location_end__gte=abs_start).first()  # type: TextUnit
            if not unit:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'location [{2};{3}] but the start of location does not belong to any '
                    'text unit object in DB.\n'
                    'It can not be. Something is broken.'.format(
                        field.python_coded_field, document, abs_start,
                        abs_end))
            # Re-base the document-wide offsets onto the text unit.
            rel_start = abs_start - unit.location_start
            rel_end = rel_start + (abs_end - abs_start)
            found.append(
                DetectedFieldValue(unit, value, None, rel_start, rel_end))
            if field_type_adapter.multi_value or field.is_choice_field():
                continue
            return persist()

        return persist()
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values by running field detectors on the values of other fields.

        The current, not user-removed values of the fields listed in
        ``field.depends_on_fields`` are converted to strings and each one is
        tested against the detectors configured for this document type and
        field. For fields that are neither multi-value nor choice fields,
        detection stops at the first value.

        :return: list of detected values (possibly empty).
        """
        dependencies = list(field.depends_on_fields.all())

        stored_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=dependencies)
        merged = merge_document_field_values_to_python_value(
            list(stored_values))

        # One entry per dependency; fields without a stored value map to None.
        field_code_to_value = {
            dep.code: merged.get(dep.code) for dep in dependencies
        }

        field_detectors = DocumentFieldDetector.objects.filter(
            document_type=doc.document_type, field=field)
        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        def first_value_is_enough() -> bool:
            return not (field_type_adapter.multi_value
                        or field.is_choice_field())

        results = list()  # type: List[DetectedFieldValue]

        for raw_value in field_code_to_value.values():
            if not raw_value:
                continue
            source_text = str(raw_value)
            for field_detector in field_detectors:
                if not field_detector.matches(source_text):
                    continue
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, source_text)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, None, hint_name))

                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of a python-coded field without saving them.

        The python-coded implementation is looked up by
        ``field.python_coded_field``. Depending on its ``by_sentence`` flag it
        is run either on each text unit of the field's unit type (ordered by
        location) or once on the document's full text. For fields that are
        neither multi-value nor choice fields, the first value is returned alone.

        :raises RuntimeError: if the python-coded field is unknown, or a value
            found in the full text does not start inside any sentence text unit.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        found = list()  # type: List[DetectedFieldValue]

        if coded_field.by_sentence:
            ordered_units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            for unit in ordered_units.iterator():
                for value, start, end in coded_field.get_values(
                        unit.text) or []:
                    found.append(
                        DetectedFieldValue(field, value, unit, None,
                                           start, end))
                    if field_type_adapter.multi_value \
                            or field.is_choice_field():
                        continue
                    return found
            return found

        for value, abs_start, abs_end in coded_field.get_values(
                doc.full_text) or []:
            # Locate the sentence that contains the start of the match.
            unit = TextUnit.objects.filter(
                document=doc,
                unit_type='sentence',
                location_start__lte=abs_start,
                location_end__gte=abs_start).first()  # type: TextUnit
            if not unit:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'location [{2};{3}] but the start of location does not belong to any '
                    'text unit object in DB.\n'
                    'It can not be. Something is broken.'.format(
                        field.python_coded_field, doc, abs_start,
                        abs_end))
            # Re-base the document-wide offsets onto the text unit.
            rel_start = abs_start - unit.location_start
            rel_end = rel_start + (abs_end - abs_start)
            found.append(
                DetectedFieldValue(field, value, unit, None,
                                   rel_start, rel_end))
            if field_type_adapter.multi_value or field.is_choice_field():
                continue
            return found

        return found
Ejemplo n.º 6
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect field values by matching detectors against text units.

        The stop-words check on the full text runs first; if it reports a
        detection, its values (or an empty list) are returned. Otherwise
        text units and detectors are fetched through the repositories attached
        to ``RegexpsOnlyFieldDetectionStrategy`` and every detector is tried
        on every text unit. For fields that are neither multi-value nor choice
        fields, detection stops at the first value.

        ``cached_fields`` is accepted but not used by this implementation.
        """
        stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_words_hit:
            return stop_words_values or list()

        strategy = RegexpsOnlyFieldDetectionStrategy
        text_units = strategy.text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        matchers = [
            DetectorFieldMatcher(detector)
            for detector in strategy.field_detector_repo.get_field_detectors(field)
        ]

        field_type_adapter = field.get_field_type()  # type: FieldType

        def first_value_is_enough() -> bool:
            return not (field_type_adapter.multi_value
                        or field.is_choice_field())

        results = list()  # type: List[DetectedFieldValue]

        for text_unit in text_units:  # type: TextUnit
            for matcher in matchers:
                matched = matcher.matching_string(
                    text_unit.text, text_is_sentence=text_unit.is_sentence())
                if matched is None:
                    continue
                value = matcher.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))

                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
Ejemplo n.º 7
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect field values by matching detectors against text units.

        The stop-words check on the full text runs first; if it reports a
        detection, its values (or an empty list) are returned. Otherwise the
        document's text units of the field's unit type are walked in location
        order and every detector of this document type and field is tried on
        each. For fields that are neither multi-value nor choice fields,
        detection stops at the first value.
        """
        stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_words_hit:
            return stop_words_values or list()

        ordered_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        field_detectors = DocumentFieldDetector.objects.filter(
            document_type=doc.document_type, field=field)

        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        def first_value_is_enough() -> bool:
            return not (field_type_adapter.multi_value
                        or field.is_choice_field())

        results = list()  # type: List[DetectedFieldValue]

        for text_unit in ordered_units.iterator():
            for field_detector in field_detectors:
                if not field_detector.matches(text_unit.text):
                    continue
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, text_unit.text)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))

                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
Ejemplo n.º 8
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect field values by matching detectors against text units.

        The stop-words check on the full text runs first; if it reports a
        detection, its values (or an empty list) are returned. Otherwise the
        document's text units of the field's unit type are walked in location
        order and every detector of the field is tried on each. For fields
        that are neither multi-value nor choice fields, detection stops at
        the first value.

        ``cached_fields`` is accepted but not used by this implementation.
        """
        stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_words_hit:
            return stop_words_values or list()

        ordered_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        field_detectors = DocumentFieldDetector.objects.filter(field=field)

        field_type_adapter = field.get_field_type()  # type: FieldType

        def first_value_is_enough() -> bool:
            return not (field_type_adapter.multi_value
                        or field.is_choice_field())

        results = list()  # type: List[DetectedFieldValue]

        for text_unit in ordered_units.iterator():  # type: TextUnit
            for field_detector in field_detectors:
                matched = field_detector.matching_string(
                    text_unit.text, text_is_sentence=text_unit.is_sentence())
                if matched is None:
                    continue
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))

                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
Ejemplo n.º 9
0
    def detect_field_values_with_model(classifier_model, document: Document,
                                       field: DocumentField,
                                       sentence_text_units: List[TextUnit],
                                       do_not_write: bool) -> int:
        """Predict field values for the given text units with a trained model.

        Each text unit is passed to ``DetectFieldValues.predict_and_extract_value``;
        units yielding no value are skipped. For fields that are neither
        multi-value nor choice fields, prediction stops at the first value.

        :return: result of ``DetectFieldValues.save_detected_values``.
        """
        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

        predicted = list()  # type: List[DetectedFieldValue]
        for unit in sentence_text_units:
            value, hint_name = DetectFieldValues.predict_and_extract_value(
                sklearn_model=sklearn_model,
                field_type_adapter=field_type_adapter,
                document=document,
                field=field,
                text_unit=unit)
            if value is not None:
                predicted.append(DetectedFieldValue(unit, value, hint_name))
                if not (field_type_adapter.multi_value
                        or field.is_choice_field()):
                    break

        return DetectFieldValues.save_detected_values(
            document, field, field_type_adapter, predicted, do_not_write)
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Predict field values for a document with the field's classifier model.

        The classifier model is looked up by document type and field. Each text
        unit of the field's unit type (in location order) is passed to
        ``cls.predict_and_extract_value``; units yielding nothing are skipped.
        For fields that are neither multi-value nor choice fields, prediction
        stops at the first value.

        :return: list of detected values (possibly empty).
        :raises ClassifierModel.DoesNotExist: if no model is stored for this
            document type and field; the situation is logged before re-raising.
        """
        document_type = doc.document_type  # type: DocumentType
        # Only the model lookup is guarded, so the "model does not exist"
        # message cannot be attributed to an unrelated failure further down.
        try:
            classifier_model = ClassifierModel.objects \
                .get(document_type=document_type, document_field=field)
        except ClassifierModel.DoesNotExist:
            log.info('Classifier model does not exist for field: {0}'.format(
                field.code))
            # Bare raise keeps the original traceback intact.
            raise

        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

        detected_values = list()  # type: List[DetectedFieldValue]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():
            detected_value = cls.predict_and_extract_value(
                sklearn_model=sklearn_model,
                field_type_adapter=field_type_adapter,
                document=doc,
                field=field,
                text_unit=text_unit)
            if detected_value is None:
                continue
            detected_values.append(detected_value)
            if not (field_type_adapter.multi_value
                    or field.is_choice_field()):
                break
        return detected_values
Ejemplo n.º 11
0
    def __init__(self, text: str, field_type: str):
        """Build an in-memory document, field, text unit, detector and matcher.

        :param text: text stored on the text unit; its length also determines
            the unit's end location (start is fixed at 1001).
        :param field_type: type code assigned to the field.
        """
        document = Document()
        field = DocumentField()
        field.type = field_type
        self.document = document
        self.field = field

        text_unit = TextUnit()
        text_unit.document = document
        text_unit.textunittext = TextUnitText()
        text_unit.textunittext.text = text
        text_unit.location_start = 1001
        text_unit.location_end = text_unit.location_start + len(text)
        self.text_unit = text_unit

        detector = DocumentFieldDetector()
        detector.regexps_pre_process_lower = True
        # Newline-separated lists: two include patterns and eight definition words.
        detector.include_regexps = (
            'at\\s{1,5}least\\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders\n'
            '(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))'
        )
        detector.definition_words = (
            'required lenders\nrequired revolving lenders\n'
            'required revolving credit lenders\nrequired term lenders\n'
            'requisite lenders\nrequisite revolving lenders\n'
            'required class lenders\nrequired ddtl lenders'
        )
        detector.detected_value = 'AFFILIATED'
        detector.text_part = TextParts.FULL.value
        detector.extraction_hint = ValueExtractionHint.TAKE_FIRST
        self.detector = detector

        self.matcher = DetectorFieldMatcher(detector)
def detect_field_value(log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       save: bool = False) -> Optional[FieldValueDTO]:
    """Detect the value of one field of a document with the field's strategy.

    The strategy is taken from ``FIELD_DETECTION_STRATEGY_REGISTRY`` by
    ``field.value_detection_strategy``; ``STRATEGY_DISABLED`` is used when the
    field names none. If the field depends on other fields, their current
    values are loaded and passed to the strategy; otherwise ``None`` is passed.

    :param save: when true and a value was detected, store it through the
        field repository.
    :return: the DTO produced by the strategy (may be ``None``).
    """
    field_repo = DocumentFieldRepository()

    if field.value_detection_strategy:
        strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]
    else:
        strategy = STRATEGY_DISABLED

    depends_on_codes = set(field.get_depends_on_codes())

    # Values of the fields this one depends on; only loaded when needed.
    doc_field_values = field_repo.get_field_code_to_python_value(
        document_type_id=doc.document_type_id,
        doc_id=doc.pk,
        field_codes_only=depends_on_codes) if depends_on_codes else None

    dto = strategy.detect_field_value(log, doc, field, doc_field_values)
    if not save or dto is None:
        return dto

    field_repo.update_field_value_with_dto(document=doc,
                                           field=field,
                                           field_value_dto=dto,
                                           user=None)
    return dto
Ejemplo n.º 13
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
        """Calculate the value of a formula-based field.

        Only values of the fields listed in the field's depends-on codes are
        made available to the formula. If the field has stop words, they are
        checked first against the newline-joined string form of those values;
        a stop-words detection short-circuits the formula.

        :param field_code_to_value: current values by field code. ``None`` is
            tolerated and treated as "no values" (callers may pass it when the
            field has no dependencies).
        :return: DTO holding the JSON form of the calculated value, or the
            value produced by the stop-words check.
        :raises ValueError: if the field has no formula, or the formula returns
            a value the field type does not accept.
        """
        formula = field.formula

        if not formula:
            raise ValueError(
                f'No formula specified for field {field.code} (#{field.uid})')

        depends_on_field_codes = field.get_depends_on_codes() or set()

        # Guard against None: a field without dependencies may get no dict.
        field_code_to_value = {
            c: v
            for c, v in (field_code_to_value or {}).items()
            if c in depends_on_field_codes
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            log.debug(
                'detect_field_value: formula_based_field_detection, checking stop words, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                doc,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                # Wrap in a DTO: the declared return type is FieldValueDTO and
                # callers hand the result on as a DTO, so a bare value or list
                # would break them.
                return FieldValueDTO(field_value=detected_values)
        else:
            log.debug('detect_field_value: formula_based_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')

        v = cls.calc_formula(field_code=field.code,
                             formula=formula,
                             depends_on_field_to_value=field_code_to_value,
                             convert_decimals_to_floats=field.
                             convert_decimals_to_floats_in_formula_args)
        typed_field = TypedField.by(field)

        # We don't accept formulas returning values of wrong type to avoid further confusion and
        # creating wrong formulas in future.
        # For example for multi-choice fields the formula should return a list and not a string
        # to ensure the admin understands that this value will replace the whole set/list of strings and not
        # just add one more string to the value.
        if typed_field.is_choice_field and typed_field.multi_value:
            if v and isinstance(v, str):
                # "outdated" formula is incorrect and returns string instead of
                # set / list, but we don't warn user: when he updates this formula
                # (or other detection method) he'll be forced to write code, returning
                # list or set.
                v = [v]

        if not typed_field.is_python_field_value_ok(v):
            raise ValueError(
                f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
            )
        v = typed_field.field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
Ejemplo n.º 14
0
    def detect_values_in_document(self, text_units: List[TextUnitMock],
                                  detector: DocumentFieldDetector):
        """Run regexp-only detection for one detector over the given text units.

        A multi-choice field that allows values outside its choices is built
        in memory. The strategy's class-level repositories are temporarily
        replaced with mock repositories and restored afterwards, even if
        detection raises.

        :return: whatever the strategy's ``detect_field_value`` returns.
        """
        init_field_type_registry()
        doc = self.setup_document(text_units)

        field = DocumentField()
        field.requires_text_annotations = False
        field.stop_words = None
        field.text_unit_type = 'sentences'
        field.type = 'multi_choice'
        field.allow_values_not_specified_in_choices = True

        text_unit_repo = MockTextUnitRepository()
        text_unit_repo.units = text_units
        for unit in text_unit_repo.units:
            unit.document = doc
            unit.unit_type = field.text_unit_type

        detect_repo = MockFieldDetectorRepository()
        detect_repo.detectors = [detector]

        # Swap the class-level repositories for the mocks; remember the originals.
        strategy = RegexpsOnlyFieldDetectionStrategy
        original_tu_repo = strategy.text_unit_repo
        original_detector_repo = strategy.field_detector_repo
        strategy.text_unit_repo = text_unit_repo
        strategy.field_detector_repo = detect_repo

        try:
            return strategy.detect_field_value(None, doc, field, {})
        finally:
            strategy.text_unit_repo = original_tu_repo
            strategy.field_detector_repo = original_detector_repo
Ejemplo n.º 15
0
 def _get_invalid_choices(self, saved_field: DocumentField) -> set:
     """Return the choice values of ``saved_field`` missing from ``self.object``.

     NOTE(review): an earlier, commented-out variant only computed this when
     neither field had ``allow_values_not_specified_in_choices`` set; the
     check is currently unconditional.
     """
     previous_choices = set(saved_field.get_choice_values())
     return previous_choices.difference(self.object.get_choice_values())
Ejemplo n.º 16
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect a field value by running detectors on the values of other fields.

        Only values of the fields listed in the field's depends-on codes are
        examined. If the field has stop words, they are checked first against
        the newline-joined string form of those values. Otherwise each
        non-empty value is converted to a string and tried against every
        detector of the field.

        :param field_code_to_value: current values by field code. ``None`` is
            tolerated and treated as "no values" (callers may pass it when the
            field has no dependencies).
        :return: for single-value fields the DTO of the first detected value,
            or ``None`` if nothing matched; for multi-value fields a DTO built
            from all detected values (also when none were found).
        """
        # Both inputs may be None; treat them as empty.
        depends_on_fields = field.get_depends_on_codes() or set()
        field_code_to_value = {
            c: v
            for c, v in (field_code_to_value or {}).items()
            if c in depends_on_fields
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            detected_with_stop_words, detected_value \
                = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
            if detected_with_stop_words:
                return FieldValueDTO(field_value=detected_value)

        field_detectors = DocumentFieldDetector.objects.filter(field=field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]
        typed_field = TypedField.by(field)  # type: TypedField

        values = list()  # type: List

        for depends_on_value in field_code_to_value.values():
            if not depends_on_value:
                continue
            depends_on_value = str(depends_on_value)
            for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
                matching_piece = detector_field_matcher.matching_string(
                    depends_on_value, text_is_sentence=False)
                if matching_piece is not None:
                    # First element of the match result is the matched text.
                    matching_string = matching_piece[0]
                    value = detector_field_matcher.get_validated_detected_value(
                        field)
                    if typed_field.requires_value:
                        hint_name = detector_field_matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        value, hint_name = typed_field \
                            .get_or_extract_value(doc,
                                                  value,
                                                  hint_name,
                                                  matching_string)
                        if value is None:
                            continue

                    value = typed_field.annotation_value_python_to_json(value)
                    if not isinstance(typed_field, MultiValueField):
                        # Single-value field: the first hit wins.
                        return FieldValueDTO(field_value=value)
                    else:
                        values.append(value)

        if isinstance(typed_field, MultiValueField):
            return FieldValueDTO(
                field_value=typed_field.
                build_json_field_value_from_json_ant_values(values))
        else:
            return None
Ejemplo n.º 17
0
    def save_detected_values(document: Document, field: DocumentField,
                             field_type_adapter: FieldType,
                             detected_values: List[DetectedFieldValue],
                             do_not_write: bool):
        """Save detected values of a field and return how many were accepted.

        For single-value choice fields the configured choices are walked in
        order and only the first detected value equal to a choice is saved.
        For all other fields every detected value is saved. Whenever there is
        at least one detected value, the document's field value cache is
        refreshed at the end, even if saving raises.

        :param do_not_write: when true, nothing is written through the
            adapter; the returned count is calculated as usual.
        :return: ``0`` if there is nothing to save or no detected value matches
            a configured choice, ``1`` for a saved single-choice value,
            otherwise the number of detected values.
        """
        if not detected_values:
            return 0

        def write(dv: DetectedFieldValue) -> None:
            # Single place for the adapter call shared by both branches.
            if do_not_write:
                return
            field_type_adapter.save_value(
                document,
                field,
                dv.get_annotation_start(),
                dv.get_annotation_end(),
                dv.get_annotation_text(),
                dv.text_unit,
                dv.value,
                user=None,
                allow_overwriting_user_data=False,
                extraction_hint=dv.hint_name)

        try:
            if field.is_choice_field() and not field_type_adapter.multi_value:
                # Choice order decides which detected value wins.
                for choice_value in field.get_choice_values():
                    for dv in detected_values:
                        if choice_value == dv.value:
                            write(dv)
                            return 1
                # No detected value is among the configured choices; this
                # path used to fall through and return None.
                return 0

            for dv in detected_values:
                write(dv)
            return len(detected_values)
        finally:
            document.cache_field_values()
Ejemplo n.º 18
0
    def clean(self):
        """
        Cross-field validation of the form.

        Checks that the selected python-coded field exists and has the same type
        as the form's field type, then, if both a formula and a type are
        given, dry-runs the formula on example values of the fields it depends on.
        Formula errors are attached to the 'formula' form field; any other
        exception is turned into a form-level ValidationError.
        """
        data = self.cleaned_data
        field_code = data.get('code')
        formula = data.get('formula')
        type_code = data.get('type')

        # Example value per dependency field; also echoed in the generic error below.
        fields_to_values = dict()
        for dep_field in data.get('depends_on_fields') or []:
            fields_to_values[dep_field] = FIELD_TYPES_REGISTRY[dep_field.type].example_json_value(dep_field)

        coded_field_code = data.get('python_coded_field')
        if coded_field_code:
            coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(coded_field_code)
            if not coded_field:
                self.add_error('python_coded_field',
                               'Unknown Python-coded field: {0}'.format(coded_field_code))
            elif type_code != coded_field.type:
                self.add_error('type', 'Python-coded field {0} is of type {1} but {2} is specified'
                                       ' as the field type'.format(coded_field.title,
                                                                   coded_field.type,
                                                                   type_code))

        # Nothing to evaluate without a non-blank formula and a field type.
        if not (formula and formula.strip() and type_code):
            return

        try:
            DocumentField.calc_formula(field_code, type_code, formula, fields_to_values)
        except DocumentFieldFormulaError as ex:
            report = ["Error caught while trying to execute formula on example values:"]
            report.extend('{0}={1}'.format(name, ex.field_values[name]) for name in ex.field_values)
            report.append("{0}. {1} in formula of field '{2}' at line {3}".format(
                type(ex.base_error).__name__, str(ex.base_error), ex.field_code, ex.line_number))
            self.add_error('formula', report)
        except Exception:
            raise forms.ValidationError(
                'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'.format(
                    str(fields_to_values), traceback.format_exc()))

        return self.cleaned_data
Ejemplo n.º 19
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Detect values of the field with its trained classifier model.

        The stop-words check on the full document text runs first; if it
        fires, its result (or an empty list) is returned right away.
        Otherwise the text units of the field's unit type are scanned in document
        order and each one is passed to cls.predict_and_extract_value().
        Scanning stops after the first value unless the field is multi-value
        or a choice field.

        :param log: logger that receives the "model does not exist" message
        :param doc: document to scan
        :param field: field to detect values for
        :param cached_fields: not used by this implementation
        :raises ClassifierModel.DoesNotExist: logged and re-raised when caught
        """
        stop_words_fired, stop_words_values = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_words_fired:
            return stop_words_values or list()

        try:
            model_record = ClassifierModel.objects.get(document_field=field)
            trained_model = model_record.get_trained_model_obj()
            type_adapter = field.get_field_type()

            found_values = list()  # type: List[DetectedFieldValue]

            units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            for unit in units.iterator():
                found = cls.predict_and_extract_value(
                    sklearn_model=trained_model,
                    field_type_adapter=type_adapter,
                    document=doc,
                    field=field,
                    text_unit=unit)
                if found is not None:
                    found_values.append(found)
                    # A single-value, non-choice field needs only the first hit.
                    if not type_adapter.multi_value and not field.is_choice_field():
                        break
            return found_values

        except ClassifierModel.DoesNotExist as e:
            message = 'Classifier model does not exist for field: {0}'.format(
                field.code)
            log.info(message)
            raise e
Ejemplo n.º 20
0
    def clean(self):
        """
        Dry-run the field formula on example values of the fields it depends on.

        Validation is skipped (returning None) when the formula is blank or the
        field type is missing. Any exception raised while evaluating the
        formula is reported as a ValidationError containing the example values
        and the traceback.
        """
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        if not (formula and formula.strip() and type_code):
            return

        fields_to_values = dict()
        for dep_field in self.cleaned_data.get('depends_on_fields') or []:
            fields_to_values[dep_field] = \
                FIELD_TYPES_REGISTRY[dep_field.type].example_json_value(dep_field)

        try:
            DocumentField.calc_formula(type_code, formula, fields_to_values)
        except Exception:
            raise forms.ValidationError(
                'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
                .format(str(fields_to_values), traceback.format_exc()))

        return self.cleaned_data
def save_detected_values(document: Document, field: DocumentField,
                         detected_values: List[DetectedFieldValue]) -> int:
    """
    Store detected values for a field and report how many were stored.

    The field type adapter is looked up in FIELD_TYPES_REGISTRY by field.type.
    For a choice field whose adapter is not multi-value only one value is
    stored: the configured choices are walked in their declared order and the
    first detected value equal to a choice wins.
    For every other field all detected values are stored.

    A value carrying a user is saved on behalf of that user and may overwrite
    user-entered data; values without a user may not.

    :param document: document the values belong to
    :param field: field the values were detected for
    :param detected_values: values to store
    :return: number of stored values; 0 if there was nothing to store or no
             detected value matched any configured choice
    """
    if not detected_values:
        return 0

    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    def _store(dv):
        field_type_adapter.save_value(
            document,
            field,
            dv.get_annotation_start(),
            dv.get_annotation_end(),
            dv.get_annotation_text(),
            dv.text_unit,
            dv.value,
            user=dv.user,
            allow_overwriting_user_data=dv.user is not None,
            extraction_hint=dv.hint_name)

    if field.is_choice_field() and not field_type_adapter.multi_value:
        for choice_value in field.get_choice_values():
            for dv in detected_values:
                if choice_value == dv.value:
                    _store(dv)
                    return 1
        # Nothing matched a configured choice: report an explicit zero
        # instead of implicitly returning None.
        return 0

    for dv in detected_values:
        _store(dv)
    return len(detected_values)
Ejemplo n.º 22
0
    def clean(self):
        """
        Dry-run the field formula on example values of the fields it depends on.

        Validation is skipped (returning None) when the formula is blank or the
        field type is missing. A DocumentFieldFormulaError is reported on the
        'formula' form field together with the values used; any other exception
        becomes a form-level ValidationError containing the traceback.
        """
        data = self.cleaned_data
        field_code = data.get('code')
        formula = data.get('formula')
        type_code = data.get('type')
        if not (formula and formula.strip() and type_code):
            return

        fields_to_values = dict()
        for dep_field in data.get('depends_on_fields') or []:
            fields_to_values[dep_field] = \
                FIELD_TYPES_REGISTRY[dep_field.type].example_json_value(dep_field)

        try:
            DocumentField.calc_formula(field_code, type_code, formula,
                                       fields_to_values)
        except DocumentFieldFormulaError as ex:
            report = [
                "Error caught while trying to execute formula on example values:"
            ]
            report.extend('{0}={1}'.format(name, ex.field_values[name])
                          for name in ex.field_values)
            report.append(
                "{0}. {1} in formula of field '{2}' at line {3}".format(
                    type(ex.base_error).__name__, str(ex.base_error),
                    ex.field_code, ex.line_number))
            self.add_error('formula', report)
        except Exception:
            raise forms.ValidationError(
                'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
                .format(str(fields_to_values), traceback.format_exc()))

        return self.cleaned_data
def detect_with_stop_words_by_field_and_full_text(field: DocumentField, full_text: str) -> Tuple[bool, Optional[List]]:
    """
    Try to decide the field value from its stop words alone.

    :param field: field whose stop_words are compiled and applied
    :param full_text: text the stop words are matched against
    :return: (False, None) when the field requires text annotations, has no
             usable stop words, or no stop word fired;
             (True, None) when a stop word fired without a value;
             (True, [DetectedFieldValue]) when a stop word fired with a value,
             which is first converted by the field's type adapter.
    """
    nothing_detected = (False, None)

    if field.requires_text_annotations:
        return nothing_detected

    compiled_stop_words = compile_stop_words(field.stop_words)
    if not compiled_stop_words:
        return nothing_detected

    type_adapter = field.get_field_type()  # type: FieldType
    fired, raw_value = detect_value_with_stop_words(compiled_stop_words, full_text)
    if not fired:
        return nothing_detected
    if raw_value is None:
        return True, None

    converted = type_adapter.extract_from_possible_value_text(field, raw_value)
    return True, [DetectedFieldValue(field, converted)]
    def make_doc_field(**kwargs) -> DocumentField:
        """
        Build a DocumentField for tests, filling in defaults for omitted attributes.

        Every keyword argument is passed to the DocumentField constructor;
        explicitly passed values always win over the defaults below.

        Each default is independent of the other arguments. Previously the
        defaults for 'allow_values_not_specified_in_choices',
        'detect_limit_unit' and 'detect_limit_count' were looked up under the
        'choices' key, so passing choices overwrote all three with the
        choices string.

        :param kwargs: DocumentField attributes to override or add
        :return: a new, unsaved DocumentField instance
        """
        defaults = {
            'requires_text_annotations': False,
            'stop_words': None,
            'text_unit_type': 'sentence',
            'type': 'multi_choice',
            'choices': 'brown fox\nbrown box\nfrown fox',
            'allow_values_not_specified_in_choices': True,
            'detect_limit_unit': 'UNIT',
            'detect_limit_count': 0,
        }
        return DocumentField(**{**defaults, **kwargs})
Ejemplo n.º 25
0
    def detect_field_values_with_regexps(document: Document,
                                         field: DocumentField,
                                         sentence_text_units: List[TextUnit],
                                         do_not_write: bool) -> int:
        """
        Run the field's detectors over the given text units and save the hits.

        Detectors are selected by the document's type and the field. For each
        text unit every matching detector contributes its configured value; for
        value-aware field types the value is obtained through the type adapter
        from the text unit text, and hits that yield None are dropped.
        For a field that is neither multi-value nor a choice field, the
        remaining detectors of the current text unit are skipped after a hit.

        :return: the result of DetectFieldValues.save_detected_values()
        """
        detectors = DocumentFieldDetector.objects.filter(
            document_type=document.document_type, field=field)
        type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        hits = list()  # type: List[DetectedFieldValue]

        for unit in sentence_text_units:
            for detector in detectors:
                if not detector.matches(unit.text):
                    continue

                value, hint_name = detector.detected_value, None
                if type_adapter.value_aware:
                    requested_hint = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        document, field, value, requested_hint, unit.text)
                    if value is None:
                        continue

                hits.append(DetectedFieldValue(unit, value, hint_name))

                # Only leaves the detector loop; the next text unit is still scanned.
                if not type_adapter.multi_value and not field.is_choice_field():
                    break

        return DetectFieldValues.save_detected_values(
            document, field, type_adapter, hits, do_not_write)
Ejemplo n.º 26
0
    def manual_test_columns(self):
        """
        Check the cache column count of document type 'k_fields_depend',
        first without field changes and then with 'k_ten' switched to
        the 'linked_documents' type.
        """
        int_field = DocumentField()
        choice_field = DocumentField()
        int_field.code, int_field.type = 'k_one', 'int'
        choice_field.code, choice_field.type = 'k_ten', 'multi_choice'

        repo = DocumentFieldRepositoryMock()
        repo.fields = [int_field, choice_field]

        doc_type = DocumentType.objects.get(code='k_fields_depend')
        self.assertEqual(35, calculate_doctype_cache_columns(doc_type, [], repo))

        # old type was 'bigint'
        retyped_field = DocumentField()
        retyped_field.code, retyped_field.type = 'k_ten', 'linked_documents'

        self.assertEqual(
            36, calculate_doctype_cache_columns(doc_type, [retyped_field], repo))
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """
        Detect field values with the python-coded field registered for the field.

        When the python-coded field detects per text unit, the text units of
        the field's unit type are scanned in document order and the returned
        locations are passed through as-is. Otherwise it is run once on the
        full document text; a located value is attached to the first text unit
        found that covers the start of its location, and the location is made
        relative to that text unit.

        Detection stops after the first value unless the field is multi-value
        or a choice field.

        :param cached_fields: not used by this implementation
        :raises RuntimeError: unknown python-coded field; a value without
                              location for a field requiring text annotations;
                              a location start not covered by any text unit
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        type_adapter = field.get_field_type()  # type: FieldType

        results = list()  # type: List[DetectedFieldValue]

        if coded_field.detect_per_text_unit:
            units = TextUnit.objects \
                .filter(document=doc) \
                .filter(unit_type=field.text_unit_type) \
                .order_by('location_start', 'pk')

            for unit in units.iterator():
                unit_hits = coded_field.get_values(log, field, doc, unit.text) or []
                for value, start, end in unit_hits:
                    results.append(
                        DetectedFieldValue(field, value, unit, None, start, end))
                    if not type_adapter.multi_value and not field.is_choice_field():
                        return results
            return results

        doc_hits = coded_field.get_values(log, field, doc, doc.full_text) or []
        for value, abs_start, abs_end in doc_hits:
            located = abs_start is not None and abs_end is not None

            if field.requires_text_annotations and not located:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'undefined location but the field requires text annotation (and location).\n'
                    'This should not happen. Something is broken.'.format(
                        field.python_coded_field, doc))

            unit = None
            rel_start = None
            rel_end = None
            if located:
                # Only the start of the location has to fall inside the text unit.
                unit = TextUnit.objects.filter(
                    document=doc,
                    unit_type=field.text_unit_type,
                    location_start__lte=abs_start,
                    location_end__gte=abs_start).first()  # type: TextUnit
                if not unit:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'location [{2};{3}] but the start of location does not belong to any '
                        'text unit object in DB.\n'
                        'This should not happen. Something is broken.'.
                        format(field.python_coded_field, doc,
                               abs_start, abs_end))
                # Shift the location so it is relative to the text unit start.
                rel_start = abs_start - unit.location_start
                rel_end = rel_start + (abs_end - abs_start)

            results.append(
                DetectedFieldValue(field, value, unit, None, rel_start, rel_end))
            if not type_adapter.multi_value and not field.is_choice_field():
                return results

        return results
Ejemplo n.º 28
0
class LoggerMock(ProcessLogger):
    """ProcessLogger stand-in that prints every message to stdout."""

    def info(self, message: str):
        """Print the message as-is."""
        print(message)

    def error(self,
              message: str,
              field_code: str = None,
              exc_info: Exception = None):
        """
        Print an error message.

        :param message: error text; replaced by "error" when empty and a
                        field code is given
        :param field_code: optional field code put in front of the message
        :param exc_info: optional exception appended on a separate line
        """
        text = f'{field_code}: {message or "error"}' if field_code else message
        if exc_info:
            text = text + f'\nException: {exc_info}'
        print(text)


# Module-level fixtures: a bare DocumentField whose uid and code are assigned
# in setup_mock(), and a logger that prints its messages to stdout.
doc_field = DocumentField()
logger = LoggerMock()


def setup_mock():
    doc_field.uid = 'ABCDEF'
    doc_field.code = 'client'

    csv_text = """
    ,value,pattern
    0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b"
    1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","family\s{1,5}name\s{1,5}\(173437\)"
    2,"Financial Services & Co. (015607) (Spelling, Tori (Chicago); Priestley, Jason (Dallas); Perry, Luke (New York); Doherty, Shannon (Chicago); Garth, Jenny (Chicago))","\bfinancial\s{1,5}services\s{1,5}.{1,5}(015607)\b"
    3,"Food Wholsale, Inc. (056230) (Jenner, Bruce (Chicago))","\bfood\s{1,5}wholsale,(056230)\b"
    4,"All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin (San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b"
    5,"Joe Smith Archives, LLC d/b/a Foxtrot (085292) (Flay, Bobby (New York))","\bfoxtrot\s{1,5}(085292)\b
Ejemplo n.º 29
0
    def process(self, **kwargs):
        """
        Normalize the codes of all document fields, document type by document type.

        For every field the code is passed through escape_column_name() and cut
        to DOCUMENT_FIELD_CODE_MAX_LEN. Fields whose code changes are renamed
        (with a numeric suffix if the escaped code is already in use within the
        document type), their long_code is recalculated, and the document
        type's field code aliases are updated to the new codes. Fields whose
        code stays the same only get their long_code refreshed if it differs.
        hide_until_js is regenerated from hide_until_python for every field.

        All DB changes run in a single transaction. Afterwards a CSV listing
        (document type, old code, new code) of the renamed fields is written
        to the task log.

        :param kwargs: not used by this method
        """

        with transaction.atomic():

            # (document type code, old field code, new field code) per renamed field
            csv_log = list()  # type: List[Tuple[str, str, str]]
            for document_type in DocumentType.objects.all():  # type: DocumentType
                # old field code -> new field code, for fixing the aliases below
                changed_field_codes = dict()  # type: Dict[str, str]
                # field code (or its numbered base) -> counter used to build suffixes
                field_code_use_counts = dict()  # type: Dict[str, int]

                # First pass: register every existing code. For a code matching
                # RE_FIELD_CODE_NUM, group 1 is treated as the base code and
                # group 2 as its number; the base counter is raised to the
                # highest number seen.
                for code in DocumentField.objects \
                        .filter(document_type=document_type) \
                        .order_by('order', 'code') \
                        .values_list('code', flat=True):
                    field_code_use_counts[code] = 1
                    m = self.RE_FIELD_CODE_NUM.fullmatch(code)
                    if m:
                        base = m.group(1)
                        num = int(m.group(2))
                        old_num = field_code_use_counts.get(base) or 0
                        field_code_use_counts[base] = max(old_num, num)

                # Second pass: escape each code and rename the field if needed.
                for field in DocumentField.objects \
                        .filter(document_type=document_type) \
                        .order_by('order', 'code'):  # type: DocumentField
                    field_code_escaped = escape_column_name(field.code)[:DOCUMENT_FIELD_CODE_MAX_LEN]

                    if field.code == field_code_escaped:
                        # Code is already in its escaped form: only keep long_code in sync.
                        field_code_use_counts[field.code] = (field_code_use_counts.get(field.code) or 0) + 1
                        long_code = DocumentField.get_long_code(field, document_type)
                        if field.long_code != long_code:
                            self.log_info('Updating field long code {0} to {1}'
                                          .format(field.long_code, long_code))
                            field.long_code = long_code
                            field.save(update_fields={'long_code'})
                    else:
                        field_code_use_count = field_code_use_counts.get(field_code_escaped)
                        if field_code_use_count is not None:
                            # Escaped code is already taken: the current counter
                            # value becomes the suffix and the counter advances.
                            field_code_use_counts[field_code_escaped] = field_code_use_count + 1
                            counter_str = str(field_code_use_count)

                            # make next repeated column name to be column1, column2, ...
                            # make it fitting into N chars by cutting the field code on the required
                            # number of chars to fit the num
                            field_code_escaped = field_code_escaped[:DOCUMENT_FIELD_CODE_MAX_LEN - len(counter_str) - 1] \
                                                 + '_' + counter_str
                        else:
                            # First use of this escaped code: register it.
                            field_code_use_counts[field_code_escaped] \
                                = (field_code_use_counts.get(field_code_escaped) or 0) + 1

                        self.log_info('Updating field {0}.{1} to {2}'
                                      .format(document_type.code, field.code, field_code_escaped))
                        changed_field_codes[field.code] = field_code_escaped
                        csv_log.append((document_type.code, field.code, field_code_escaped))
                        field.code = field_code_escaped

                        # long_code is recalculated after the new code has been assigned
                        field.long_code = DocumentField.get_long_code(field, document_type)

                        field.save(update_fields={'code', 'long_code'})

                    # Regenerate the JS version of the hide-until expression;
                    # an empty hide_until_python gives an empty string.
                    hide_until_js = jiphy.to.javascript(field.hide_until_python) if field.hide_until_python else ''
                    if hide_until_js != field.hide_until_js:
                        field.hide_until_js = hide_until_js
                        self.log_info('Updating hide_until_js for field {0}.{1}'
                                      .format(document_type.code, field.code))
                        field.save(update_fields={'hide_until_js'})

                # Replace renamed field codes in the alias values; alias keys
                # and values of untouched fields are kept.
                if len(changed_field_codes) > 0 and document_type.field_code_aliases:
                    updated_aliases = {k: changed_field_codes.get(v) or v
                                       for k, v in document_type.field_code_aliases.items()}
                    self.log_info('Updating field code aliases of document type {0}"\n{1}'
                                  .format(document_type.code, updated_aliases))
                    document_type.field_code_aliases = updated_aliases
                    document_type.save(update_fields={'field_code_aliases'})

        # Outside the transaction: dump the list of renamed fields as CSV into the log.
        output = io.StringIO()
        writer = csv.writer(output)
        writer.writerow(('Document Type', 'Old Field Code', 'New Field Code'))
        for r in csv_log:
            writer.writerow(r)
        self.log_info('\n\n\n------------------\n'
                      'Changed fields csv:\n' + output.getvalue() + '\n------------------')
Ejemplo n.º 30
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """
        Detect values of the field from the values of the fields it depends on.

        The document's stored values of the depends-on fields (excluding those
        removed by a user) are merged into one python value per field code.
        If the field has stop words they are tried first against the
        newline-joined string forms of those values. Otherwise each non-empty
        value is converted to a string and run through the field's detectors.

        For a field that is neither multi-value nor a choice field, detection
        stops at the first detected value.

        :param log: not used by this implementation
        :param doc: document whose field values are examined
        :param field: field to detect values for
        """
        source_fields = list(field.depends_on_fields.all())

        stored_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=source_fields)

        merged = merge_document_field_values_to_python_value(list(stored_values))

        # One entry per depends-on field; fields without a value map to None.
        field_code_to_value = {
            source.code: merged.get(source.code)
            for source in source_fields
        }

        if field.stop_words:
            joined_text = '\n'.join(map(str, field_code_to_value.values()))
            stop_words_fired, stop_words_values \
                = detect_with_stop_words_by_field_and_full_text(field, joined_text)
            if stop_words_fired:
                return stop_words_values or list()

        detectors = DocumentFieldDetector.objects.filter(field=field)
        type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        results = list()  # type: List[DetectedFieldValue]

        for source_value in field_code_to_value.values():
            if not source_value:
                continue
            source_text = str(source_value)

            for detector in detectors:  # type: DocumentFieldDetector
                matching_string = detector.matching_string(
                    source_text, text_is_sentence=False)
                if matching_string is None:
                    continue

                value, hint_name = detector.get_validated_detected_value(field), None
                if type_adapter.requires_value:
                    requested_hint = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, requested_hint, matching_string)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, None, hint_name))

                if not type_adapter.multi_value and not field.is_choice_field():
                    break

            # The inner break only leaves the detector loop; stop the outer one too.
            if results and not type_adapter.multi_value and not field.is_choice_field():
                break

        return results