Ejemplo n.º 1
0
def apply_simple_config(log: ProcessLogger, document_field: DocumentField,
                        csv: bytes, drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create one field detector per data row of a "simple config" CSV.

    For every row, the value in the first column becomes the detector's
    ``detected_value`` and all non-empty cells of the row (first column
    included), joined by newlines and lower-cased, become its
    ``include_regexps``.

    Args:
        log: receives info messages and progress steps.
        document_field: field the new detectors are attached to; its
            ``choices`` are overwritten and saved when
            ``update_field_choice_values`` is true.
        csv: raw CSV content. It is parsed with the pandas defaults, so the
            first line is consumed as the header row.
        drop_previous_field_detectors: when true, detectors of this field
            in the imported-simple-config category are deleted first.
        update_field_choice_values: when true, the field's choices are
            replaced by the sorted unique non-empty first-column values.

    Raises:
        ValueError: if the CSV has no data rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        first_column = df.iloc[:, 0]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    # One progress step per started batch of ten rows.
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    # Count rows explicitly instead of relying on the frame's index labels.
    for position, (_, row) in enumerate(df.iterrows()):
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # Positional access: the row is labelled by the CSV header strings,
        # so ``row[0]`` would depend on the deprecated integer-key fallback.
        detector.detected_value = row.iloc[0]
        # NOTE(review): lower() is applied to the whole pattern text, so
        # regex escapes such as \D or \S get lower-cased as well -- confirm
        # that config patterns never rely on upper-case escapes.
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if position % 10 == 0:
            log.step_progress()
    log.info('Done.')
 def make_doc_field_detector(exclude_regexps: Optional[str] = None,
                             include_regexps: Optional[str] = None,
                             detected_value: Optional[str] = None,
                             regexps_pre_process_lower: bool = True,
                             definition_words: Optional[str] = None) -> DocumentFieldDetector:
     """Build an unsaved DocumentFieldDetector with preset matching options.

     ``exclude_regexps`` and ``include_regexps`` fall back to built-in
     sample patterns only when they are ``None``; an empty string is kept
     as given. ``detected_value`` is assigned only when it is not ``None``.
     The extraction hint is always ``'TAKE_FIRST'`` and the text part is
     always ``'INSIDE_REGEXP'``. The detector is returned without calling
     ``save()``.
     """
     if exclude_regexps is None:
         exclude_regexps = 'cushion'
     if include_regexps is None:
         include_regexps = r'(?<=\D{3,3}\s\D{5,5}\s)\D+'

     field_detector = DocumentFieldDetector()
     field_detector.exclude_regexps = exclude_regexps
     field_detector.include_regexps = include_regexps
     if detected_value is not None:
         field_detector.detected_value = detected_value
     field_detector.extraction_hint = 'TAKE_FIRST'
     field_detector.text_part = 'INSIDE_REGEXP'
     field_detector.regexps_pre_process_lower = regexps_pre_process_lower
     field_detector.definition_words = definition_words
     return field_detector
Ejemplo n.º 3
0
    def save_detector_settings(
            self, detectors_by_value: Dict[str, List[str]]) -> None:
        """Persist detector patterns grouped by the value they detect.

        When ``self.save_in_csv_format`` is set, the whole mapping is handed
        to ``save_detector_settings_csv`` and nothing else happens here.
        Otherwise one ``DocumentFieldDetector`` record is saved per key of
        ``detectors_by_value``, with the key as ``detected_value`` and its
        patterns joined by newlines as ``include_regexps``.

        Args:
            detectors_by_value: maps a detected value to the list of
                include patterns for that value.
        """
        if self.save_in_csv_format:
            # CSV-style storage is handled entirely by the dedicated method.
            self.save_detector_settings_csv(detectors_by_value)
            return

        if self.drop_previous_field_detectors:
            # Remove this field's previously imported detectors first.
            previous = DocumentFieldDetector.objects.filter(
                field=self.document_field,
                category=self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG)
            previous.delete()

        for detected_value, patterns in detectors_by_value.items():
            record = DocumentFieldDetector()
            record.category = self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
            record.field = self.document_field
            record.regexps_pre_process_lower = True
            record.detected_value = detected_value
            record.include_regexps = '\n'.join(patterns)
            record.save()