Beispiel #1
0
def apply_simple_config(log: ProcessLogger, document_field: DocumentField,
                        csv: bytes, drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create field detectors for a document field from a simple CSV config.

    The CSV is parsed by pandas with every cell read as ``str`` (with
    pandas' default settings the first line is consumed as the header).
    Each data row yields one ``DocumentFieldDetector``: the first cell is
    stored as the detected value, and all non-empty cells of the row (the
    first one included), newline-joined and lower-cased, become the
    include regexps.

    :param log: receives info messages and progress steps.
    :param document_field: field the new detectors are attached to.
    :param csv: raw CSV content.
    :param drop_previous_field_detectors: when true, delete this field's
        detectors of category ``FD_CATEGORY_IMPORTED_SIMPLE_CONFIG`` before
        creating the new ones.
    :param update_field_choice_values: when true, overwrite
        ``document_field.choices`` with the sorted, de-duplicated,
        non-empty values of the first column and save the field.
    :raises ValueError: if the parsed CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        first_column = df.iloc[:, 0]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    # Count rows positionally instead of trusting the index labels: pandas
    # may promote a CSV column to the index, in which case the labels are
    # strings and cannot be used for the modulo below.
    for position, (_, row) in enumerate(df.iterrows()):
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # The row is labelled by the CSV header strings, so the first cell
        # must be read by position; plain ``row[0]`` is deprecated for
        # non-integer labels.
        # NOTE(review): a row whose first cell is empty yields NaN here -
        # confirm whether such rows should be skipped instead.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        # Report progress once per started batch of 10 rows.
        if position % 10 == 0:
            log.step_progress()
    log.info('Done.')
Beispiel #2
0
    def save_detector_settings(
            self, detectors_by_value: Dict[str, List[str]]) -> None:
        """Persist detection patterns grouped by the value they detect.

        When ``self.save_in_csv_format`` is set, the whole mapping is handed
        to ``save_detector_settings_csv`` and nothing else happens here.
        Otherwise one ``DocumentFieldDetector`` is saved per mapping entry,
        with the entry's patterns newline-joined into ``include_regexps``.

        :param detectors_by_value: maps a detected value to the list of
            include patterns that should produce it.
        """
        if self.save_in_csv_format:
            # CSV mode: all "pattern: value" pairs are stored by the
            # dedicated CSV saver.
            self.save_detector_settings_csv(detectors_by_value)
            return

        if self.drop_previous_field_detectors:
            # Optionally clear detectors left by a previous import for
            # this field.
            previous = DocumentFieldDetector.objects.filter(
                field=self.document_field,
                category=self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG)
            previous.delete()

        for detected_value, patterns in detectors_by_value.items():
            new_detector = DocumentFieldDetector()
            new_detector.category = self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
            new_detector.field = self.document_field
            new_detector.regexps_pre_process_lower = True
            new_detector.detected_value = detected_value
            new_detector.include_regexps = '\n'.join(patterns)
            new_detector.save()