Ejemplo n.º 1
0
def reindex_status_name_for_all_documents_in_system(task: ExtendedTask, status_pk: Any) -> None:
    """
    Schedule re-indexing sub-tasks for the document id packets selected by ``status_pk``.

    When the "disable raw DB caching" app var is set, only an info message is logged
    and nothing is scheduled.

    :param task: running task used for logging and for starting sub-tasks.
    :param status_pk: status primary key forwarded to the document id selector.
    """
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING

    caching_disabled = APP_VAR_DISABLE_RAW_DB_CACHING.val
    if not caching_disabled:
        ids_packets = get_all_doc_ids_not_planned_to_index_by_status_pk(status_pk, DOC_NUM_PER_SUB_TASK)
        _reindex_document_ids_packets(task, ids_packets)
        return

    task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
Ejemplo n.º 2
0
    def check_document_field_values(task: ExtendedTask, dfv_ids: Set, delete_broken: bool = False):
        """
        Check stored document field values and report (optionally delete) the broken ones.

        A value is treated as broken when reading ``python_value`` raises, or when a
        non-None value of a choice field is not accepted by ``field.is_choice_value``.

        :param task: running task used for logging.
        :param dfv_ids: primary keys of the DocumentFieldValue rows to check.
        :param delete_broken: when True the broken row is deleted; otherwise its admin
            change URL is included in the log message.
        """
        for dfv in DocumentFieldValue.objects \
                .filter(pk__in=dfv_ids) \
                .select_related('field'):  # type: DocumentFieldValue
            try:
                temp_value = dfv.python_value
                if temp_value is not None:
                    field = dfv.field
                    if field.is_choice_field() and not field.is_choice_value(temp_value):
                        raise ValueError('Field value {0} is not in list of its choice values:\n{1}'
                                         .format(temp_value, field.choices))
            except Exception:
                # "except Exception" instead of a bare except: KeyboardInterrupt / SystemExit
                # must not be reported (and possibly deleted) as a broken value.

                # Remember the pk now: Django's Model.delete() resets the instance pk to
                # None, which would make the log message useless.
                dfv_pk = dfv.pk
                if delete_broken:
                    dfv.delete()
                    msg = render_error('Found broken document field value.\n'
                                       'Document field value id: {0}\n'
                                       'DB value: {1}\n'
                                       'The document field value has been deleted.\n'
                                       .format(dfv_pk, dfv.value))
                else:
                    content_type = ContentType.objects.get_for_model(DocumentFieldValue)
                    dfv_admin_url = reverse("admin:%s_%s_change" %
                                            (content_type.app_label, content_type.model), args=(dfv_pk,))
                    msg = render_error('Found broken document field value.\n'
                                       'Document field value id: {0}\n'
                                       'DB value: {1}\n'
                                       'Admin URL: {2}\n'.format(dfv_pk, dfv.value, dfv_admin_url))
                task.log_info(msg)
Ejemplo n.º 3
0
    def detect_and_process_lease_document(task: ExtendedTask, document_id: int, no_detect: bool):
        """
        Detect whether a document is a lease document and, if so, process and save it.

        The document is processed when a LeaseDocument row already exists for it, when
        ``no_detect`` is set, or when the lease detector accepts the document text.

        :param task: running task used for logging and passed to field processing.
        :param document_id: primary key of the Document to inspect.
        :param no_detect: skip detection and process the document unconditionally.
        """
        doc = Document.objects.get(pk=document_id)
        doc_text = doc.full_text

        try:
            lease_doc = LeaseDocument.objects.get(pk=document_id)
        except LeaseDocument.DoesNotExist:
            # Only a missing row means "no lease document yet". Other failures
            # (e.g. database errors) must propagate instead of being mistaken for it.
            lease_doc = None

        is_lease = lease_doc or no_detect \
            or ProcessLeaseDocuments.lease_doc_detector.is_lease_document(doc_text)
        if not is_lease:
            task.log_info('Not a lease document: #{0}. {1}'.format(document_id, doc.name))
            return

        task.log_info('{2} lease document: #{0}. {1}'
                      .format(document_id,
                              doc.name,
                              'Processing' if no_detect else 'Detected'))
        if not lease_doc:
            # Create the child row on top of the existing Document and copy its state over.
            lease_doc = LeaseDocument(document_ptr=doc)
            lease_doc.__dict__.update(doc.__dict__)

        ProcessLeaseDocuments.process_landlord_tenant(lease_doc, doc_text)
        ProcessLeaseDocuments.process_fields(lease_doc, doc_text, task)

        lease_doc.save()
Ejemplo n.º 4
0
    def join_field_detector_model_tests(task: ExtendedTask, field_uid,
                                        classifier_model_id):
        """
        Aggregate the results of the per-document field detector tests and log the accuracy.

        Results are read from the Task rows whose main task is this task's parent and
        whose name is that of ``TrainAndTest.test_field_detector_model``.

        :param task: running task used for logging and for locating the sibling results.
        :param field_uid: primary key of the DocumentField that was tested.
        :param classifier_model_id: optional ClassifierModel pk; when given, the computed
            accuracy is stored in its ``field_detection_accuracy``.
        """
        results = list(Task.objects
                       .filter(main_task_id=task.request.parent_id,
                               name=TrainAndTest.test_field_detector_model.name)
                       .values_list('result', flat=True))

        test_text_units_number = 0
        match_number = 0
        test_doc_number = 0

        matches_per_value = dict()
        total_per_value = dict()

        for res in results:
            actual_field_value = res.get('actual_field_value')

            if actual_field_value:
                total_per_value[actual_field_value] = total_per_value.get(actual_field_value, 0) + 1

            test_doc_number += 1
            test_text_units_number += (res.get('text_units_number') or 0)
            if res.get('value_matches_expected'):
                match_number += 1
                if actual_field_value:
                    matches_per_value[actual_field_value] = matches_per_value.get(actual_field_value, 0) + 1

        # With no test results the plain division raised ZeroDivisionError and the
        # summary was never logged; report 0.0 instead.
        accuracy = match_number / test_doc_number if test_doc_number else 0.0

        if classifier_model_id:
            classifier_model = ClassifierModel.objects.get(
                pk=classifier_model_id)
            classifier_model.field_detection_accuracy = accuracy
            classifier_model.save()

        field = DocumentField.objects.get(pk=field_uid)

        task.log_info(
            'Testing finished.\n'
            'Document type: {0}.\n'
            'Field: {1}.\n'
            'Text unit type: {2}.\n'
            'Test documents number: {3}.\n'
            'Test text units number: {4}.\n'
            'Accuracy: {5}.\n'.format(field.document_type.code, field.code,
                                      field.text_unit_type, test_doc_number,
                                      test_text_units_number, accuracy))

        if field.is_choice_field():
            # Every key in total_per_value was counted at least once, so total >= 1.
            accuracy_per_value = {
                actual_field_value:
                matches_per_value.get(actual_field_value, 0) / total
                for actual_field_value, total in total_per_value.items()
            }
            task.log_info('Accuracy per value:\n{0}'.format(
                json.dumps(accuracy_per_value, sort_keys=True, indent=2)))
def auto_reindex_not_tracked(task: ExtendedTask,
                             document_type_code: str = None,
                             force: bool = False):
    """
    Adapt raw tables for one or all document types and start ``manual_reindex`` where needed.

    A re-index is started for a document type when ``adapt_table_structure`` reports it is
    needed, or otherwise when there are non-indexed documents not planned for indexing and
    no other re-index task exists for that type. Returns silently when raw DB caching is
    disabled.

    :param task: running task; provides logging, ``task.task.user_id`` and ``request.id``.
    :param document_type_code: code of a single document type, or None for all types.
    :param force: forwarded to ``adapt_table_structure``.
    """
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return

    if document_type_code is not None:
        document_types = [DocumentType.objects.get(code=document_type_code)]
    else:
        document_types = DocumentType.objects.all()

    log = CeleryTaskLogger(task)
    task_model = task.task

    for document_type in document_types:
        if adapt_table_structure(log, document_type, force=force):
            force_fmt = ', forced' if force else ''
            task.log_info(
                f'Re-index from auto_reindex_not_tracked, {task.name}, '
                f'for {document_type}{force_fmt}')
            call_task_func(manual_reindex, (document_type.code, False),
                           task_model.user_id)
            continue

        # Table structure is fine: re-index only if something is left un-indexed
        # and nobody else is already re-indexing this document type.
        if not there_are_non_indexed_docs_not_planned_to_index(document_type, log):
            continue
        if any_other_reindex_task(task.request.id, document_type.code).exists():
            continue

        task.log_info(
            f'auto_reindex_not_tracked({document_type.code}): '
            f'there_are_non_indexed_docs_not_planned_to_index')
        call_task_func(manual_reindex, (document_type.code, False),
                       task_model.user_id)
Ejemplo n.º 6
0
    def load_doc(task: ExtendedTask, document: Document, document_fields: Dict, run_detect_field_values: bool,
                 filed_owners: dict = None):
        """
        Save a document together with the field values supplied for it.

        Field values are parsed first; then, inside one transaction, the document is saved,
        values without creator/modifier that were not removed by a user are deleted for an
        already existing document, the parsed values are stored and field detection/caching
        is run.

        :param task: running task used for logging.
        :param document: document to save (inserted when it has no pk yet).
        :param document_fields: mapping passed to ``load_field_values``.
        :param run_detect_field_values: passed as the third argument of the detection call;
            when False the returned values are additionally passed to ``cache_field_values``.
        :param filed_owners: optional owners mapping passed to ``load_field_values``.
        """
        owners = filed_owners or {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, document_fields, owners)
        log = CeleryTaskLogger(task)

        with transaction.atomic():
            is_new = document.pk is None
            document.save(force_insert=is_new)

            if not is_new:
                stale_values = DocumentFieldValue.objects.filter(document=document,
                                                                 removed_by_user=False,
                                                                 created_by__isnull=True,
                                                                 modified_by__isnull=True)
                stale_values.delete()

            for field, values in fields_to_values.items():
                field_detection.save_detected_values(document, field, values)

            if not run_detect_field_values:
                dfvs = field_detection.detect_and_cache_field_values_for_document(log, document, False)
                field_value_cache.cache_field_values(document, dfvs, save=True)
            else:
                field_detection.detect_and_cache_field_values_for_document(log, document, True)

        task.log_info('Loaded {0} field values for document #{1} ({2})'
                      .format(len(fields_to_values), document.pk, document.name))
def update_status_name_for_documents(task: ExtendedTask,
                                     status_pk: Any) -> None:
    """
    Schedule re-indexing sub-tasks for the document id packets selected by ``status_pk``.

    Packets are requested with a size argument of 20. When raw DB caching is disabled
    only an info message is logged.

    :param task: running task used for logging and for starting sub-tasks.
    :param status_pk: status primary key forwarded to the document id selector.
    """
    if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
        ids_packets = get_all_doc_ids_not_planned_to_index_by_status_pk(status_pk, 20)
        _reindex_document_ids_packets(task, ids_packets)
        return

    task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
Ejemplo n.º 8
0
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False):
    """
    Run ``adapt_tables_and_reindex`` for the given document type unless caching is disabled.

    :param task: running task used for logging and passed on to the re-index routine.
    :param document_type_code: document type code forwarded unchanged (may be None).
    :param force: forwarded unchanged to ``adapt_tables_and_reindex``.
    """
    caching_disabled = APP_VAR_DISABLE_RAW_DB_CACHING.val
    if not caching_disabled:
        adapt_tables_and_reindex(task, document_type_code, force, True)
        return

    task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
Ejemplo n.º 9
0
    def run_train_model_for_field(task: ExtendedTask, document_type_uid,
                                  field_uid, no_field_sentences,
                                  trigger_re_detecting_field_values):
        """
        Train the detection model of one field and optionally re-detect its values.

        Training is skipped (with a log message) when the document type field is flagged
        ``use_regexp_always`` or when no train data is returned. After a successful
        training, and only if requested, one "detect field values" sub-task is started per
        active document of the document type.

        :param task: running task used for logging and for starting sub-tasks.
        :param document_type_uid: primary key of the DocumentType.
        :param field_uid: primary key of the DocumentField.
        :param no_field_sentences: forwarded to ``train_model``.
        :param trigger_re_detecting_field_values: start the re-detection sub-tasks afterwards.
        """
        document_type = DocumentType.objects.get(pk=document_type_uid)
        field = DocumentField.objects.get(pk=field_uid)

        task.log_info('Training model for field #{0} ({1})...'.format(field_uid, field.code))

        document_type_field = DocumentTypeField.objects.get(document_type=document_type,
                                                            document_field=field)

        if document_type_field.use_regexp_always:
            task.log_info('Regexp will used for document_type #{0} and field #{1}.'
                          .format(document_type_uid, field_uid))
            return None

        train_data = TrainDocumentFieldDetectorModel.get_train_data(
            document_type, field, document_type_field.trained_after_documents_number)

        if not train_data:
            task.log_info('Not enough data to train model for document_type #{0} and field #{1}.'
                          .format(document_type_uid, field_uid))
            return None

        samples = TrainDirtyDocumentFieldDetectorModel.get_train_data_generator(train_data)
        total_samples = TrainDirtyDocumentFieldDetectorModel.train_model(
            samples, document_type, field, no_field_sentences)
        task.log_info('Finished training model for document_type #{0} and field #{1}. '
                      'Total number of samples: {2}'
                      .format(document_type_uid, field_uid, total_samples))

        if not trigger_re_detecting_field_values:
            return

        # Fetch the rows once; both argument lists below are built from the same rows.
        doc_rows = list(Document.objects.active()
                        .filter(document_type=document_type)
                        .values_list('pk', 'name', 'source'))

        source_data = ['{0}/{1}'.format(source, name) if source else name
                       for _pk, name, source in doc_rows]
        sub_task_args = [(doc_pk, False, field_uid) for doc_pk, _name, _source in doc_rows]

        task.run_sub_tasks('Detect Values of Field {0} for Each Document'.format(field.code),
                           DetectFieldValues.detect_field_values_for_document,
                           sub_task_args,
                           source_data)
    def load_field_values(task: ExtendedTask, document: Document,
                          document_fields: dict, field_owners: dict) -> Dict:
        """
        Convert raw "alias -> text" pairs into detected field values grouped by field.

        Aliases are matched case-insensitively against the field codes of the document type
        and against its ``field_code_aliases``. Unknown aliases are logged as warnings;
        None inputs and values that extract to None are skipped.

        :param task: running task used for logging warnings.
        :param document: document whose ``document_type`` defines the known fields.
        :param document_fields: mapping of field alias to a value text or a list of them.
        :param field_owners: mapping of field alias to the user passed to DetectedFieldValue.
        :return: mapping of field to the list of DetectedFieldValue built for it; empty
            when the document has no document type.
        """
        fields_to_values = {}

        document_type = document.document_type
        if not document_type:
            return fields_to_values

        field_codes_to_fields = {}
        for doc_field in document_type.fields.all():
            field_codes_to_fields[doc_field.code.lower()] = doc_field

        aliases = document_type.field_code_aliases
        if aliases:
            # Resolve every alias against the plain codes first, then merge them in.
            resolved_aliases = {}
            for alias, code in aliases.items():
                if alias and code:
                    resolved_aliases[alias.lower()] = field_codes_to_fields.get(code.lower())
            field_codes_to_fields.update(resolved_aliases)

        def _remember(target_field, raw_value, owner):
            if raw_value is None:
                return
            detected = DetectedFieldValue(target_field, raw_value, user=owner)
            fields_to_values.setdefault(target_field, []).append(detected)

        for field_alias, field_value_text in document_fields.items():
            if field_value_text is None:
                continue

            field = field_codes_to_fields.get(field_alias.lower())
            if not field:
                task.log_warn('Field alias "{0}" not found for document type {1}'
                              .format(field_alias, document_type.code))
                continue

            field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType
            field_owner = field_owners.get(field_alias)

            # A single text is handled as a one-element list of candidates.
            if type(field_value_text) is list:
                candidates = list(field_value_text)
            else:
                candidates = [field_value_text]

            for candidate in candidates:
                extracted = field_type_adapter.extract_from_possible_value_text(field, candidate)
                _remember(field, extracted, field_owner)

        return fields_to_values
    def test_field_detector_model(task: ExtendedTask, field_id,
                                  document_id) -> dict:
        """
        Compare freshly detected values of a field with the values stored for a document.

        Detection is run with ``save=False``. For value-aware fields the merged python
        values are compared; otherwise the sets of referenced text unit ids are compared.
        A mismatch is logged together with the texts of the detected text units.

        :param task: running task used for logging.
        :param field_id: primary key of the DocumentField under test.
        :param document_id: primary key of the Document to test on.
        :return: dict with ``text_units_number`` (text units of the field's unit type in
            the document) and ``value_matches_expected`` (bool).
        """
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

        expected_dfvs = field_detection.detect_and_cache_field_values(
            CeleryTaskLogger(task), document, field,
            save=False)  # type: List[DetectedFieldValue]
        actual_dfvs = list(
            DocumentFieldValue.objects.filter(
                document=document, field=field,
                removed_by_user=False).all())  # type: List[DocumentFieldValue]

        if field.is_value_aware():
            # dates, numbers, e.t.c.
            expected_field_values = field_detection.merge_detected_field_values_to_python_value(
                expected_dfvs)
            expected_field_value = expected_field_values.get(field.code)

            actual_field_values = merge_document_field_values_to_python_value(
                actual_dfvs)
            actual_field_value = actual_field_values.get(field.code)

            matches = bool(expected_field_value == actual_field_value)
        else:
            # related-info e.t.c.
            expected_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in expected_dfvs if dfv.text_unit
            }
            expected_field_value = '; '.join(sorted(expected_set))

            actual_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in actual_dfvs if dfv.text_unit
            }
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            # Detected values may have no text unit (see the guards above); skip those
            # instead of failing with AttributeError while building the log message.
            detected_texts = '\n---\n'.join(
                dfv.text_unit.text for dfv in (expected_dfvs or []) if dfv.text_unit)
            # This branch is only reached on a mismatch, so the tag is always the error one.
            task.log_info(
                '{3} Test doc: {0} (Project: {5}). '
                'Detected: {1}. Real: {2}.\nDetected in text:-----\n{4}\n-----'
                .format(
                    document.name, expected_field_value, actual_field_value,
                    '[ ERR  ]', detected_texts,
                    document.project.name if document.project else ''))

        text_units_number = TextUnit.objects.filter(
            document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number': text_units_number,
            'value_matches_expected': matches
        }
Ejemplo n.º 12
0
def _reindex_document_ids_packets(task: ExtendedTask, ids_packets: Generator[List[int], None, None]) -> None:
    """
    Start re-index sub-tasks for the given packets of document ids, 100 packets per call.

    Each packet becomes the single argument of one sub-task. The remaining packets are
    submitted with a final ``run_sub_tasks`` call, which is made even when none are left.

    :param task: running task used for starting the sub-tasks.
    :param ids_packets: iterable yielding lists of document ids.
    """
    reindex_task_name = 'Reindex set of documents'
    packets_per_call = 100

    batch = []
    for ids in ids_packets:
        batch.append((ids,))
        if len(batch) < packets_per_call:
            continue
        task.run_sub_tasks(reindex_task_name, cache_document_fields_for_doc_ids_tracked, batch)
        # Start a fresh list: the submitted one must not be modified afterwards.
        batch = []

    task.run_sub_tasks(reindex_task_name, cache_document_fields_for_doc_ids_tracked, batch)
Ejemplo n.º 13
0
def reindex_all_project_documents(task: ExtendedTask, project_pk: Any) -> None:
    """
    Schedule re-indexing sub-tasks for the document id packets selected by ``project_pk``.

    When raw DB caching is disabled only an info message is logged.

    :param task: running task used for logging and for starting sub-tasks.
    :param project_pk: project primary key forwarded to the document id selector.
    """
    if not APP_VAR_DISABLE_RAW_DB_CACHING.val:
        ids_packets = get_all_doc_ids_not_planned_to_index_by_project_pk(project_pk, DOC_NUM_PER_SUB_TASK)
        _reindex_document_ids_packets(task, ids_packets)
        return

    task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
Ejemplo n.º 14
0
    def local_train_model_for_document_class(task: ExtendedTask,
                                             document_class_name: str):
        """
        Train and store a text classifier for the dataset entries of a document class.

        The ClassifierModel of kind ``KIND_SENTENCES_RELATED_TO_FIELDS`` with no document
        field is loaded for the class; its dataset entries are grouped by category, shuffled
        and used to fit a CountVectorizer -> TfidfTransformer -> SGDClassifier pipeline.
        The fitted pipeline is wrapped into SkLearnClassifierModel and saved on the model.

        :param task: running task used for logging.
        :param document_class_name: value matched against ``ClassifierModel.document_class``.
        """
        task.log_info('Building classifier model for document class: {0}'.format(document_class_name))

        classifier_model = ClassifierModel.objects.get(
            kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
            document_class=document_class_name,
            document_field=None)

        category_query = ClassifierDataSetEntry.objects \
            .filter(field_detection_model=classifier_model) \
            .values_list('category', flat=True) \
            .distinct()
        target_names = list(category_query)

        labels = []
        texts = []
        # The label of a sample is the position of its category in target_names.
        for label, category in enumerate(target_names):
            category_texts = list(
                ClassifierDataSetEntry.objects
                .filter(field_detection_model=classifier_model, category=category)
                .values_list('text', flat=True))
            labels.extend([label] * len(category_texts))
            texts.extend(category_texts)

        target = np.array(labels)
        data = np.array(texts)

        # Shuffle samples and labels with the same permutation.
        random_state = check_random_state(seed=None)
        indices = np.arange(data.shape[0])
        random_state.shuffle(indices)
        data = data[indices]
        target = target[indices]

        classifier = SGDClassifier(loss='hinge',
                                   penalty='l2',
                                   alpha=1e-3,
                                   random_state=42,
                                   max_iter=5,
                                   tol=None)
        text_clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        sklearn_model = text_clf.fit(data, target)

        model = SkLearnClassifierModel(sklearn_model=sklearn_model,
                                       target_names=target_names)
        classifier_model.set_trained_model_obj(model)
        classifier_model.save()

        task.log_info('Trained model based on {0} dataset entries for document class: {1}'
                      .format(len(target), document_class_name))
Ejemplo n.º 15
0
    def _detect_field_values_for_document(task: ExtendedTask, document_id,
                                          do_not_write, field_uid):
        """
        Detect values of one field (or of all fields of the document type) for a document.

        Nothing is done for documents whose status is not active. For every processed field
        the values without creator/modifier that were not removed by a user are deleted
        first; then the field is handled by its python-coded detector, by the classifier
        model (when one exists and regexps are not forced) or by regexps.

        :param task: running task used for logging.
        :param document_id: primary key of the Document.
        :param do_not_write: forwarded to the detection routines.
        :param field_uid: primary key of a single DocumentField, or a falsy value to
            process all fields of the document type.
        """
        doc = Document.objects.get(pk=document_id)

        status = doc.status
        if status and not status.is_active:
            task.log_info(
                'Forbidden detecting field values for document with "completed"'
                ' status, document #{} ({})'.format(document_id, doc.name))
            return

        document_type = doc.document_type
        if field_uid:
            fields = [DocumentField.objects.get(pk=field_uid)]  # type: List[DocumentField]
        else:
            fields = document_type.fields.all()

        sentence_text_units = list(TextUnit.objects.filter(document=doc, unit_type="sentence"))

        detected_counter = 0
        for field in fields:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            # NOTE(review): this runs even when do_not_write is set - confirm it is intended.
            previously_detected = DocumentFieldValue.objects.filter(document=doc,
                                                                    field=field,
                                                                    removed_by_user=False,
                                                                    created_by__isnull=True,
                                                                    modified_by__isnull=True)
            previously_detected.delete()

            if field.python_coded_field:
                detected_counter += DetectFieldValues.detect_field_values_for_python_coded_field(
                    doc, field, sentence_text_units, do_not_write)
                continue

            document_type_field = DocumentTypeField.objects.get(document_type=document_type,
                                                                document_field=field)

            classifier_model = None
            if not document_type_field.use_regexp_always:
                try:
                    classifier_model = ClassifierModel.objects.get(document_type=document_type,
                                                                   document_field=field)
                except ClassifierModel.DoesNotExist:
                    # No trained model for this field: fall back to regexps below.
                    classifier_model = None

            if not classifier_model:
                detected_counter += DetectFieldValues.detect_field_values_with_regexps(
                    doc, field, sentence_text_units, do_not_write)
            else:
                detected_counter += DetectFieldValues.detect_field_values_with_model(
                    classifier_model, doc, field, sentence_text_units, do_not_write)

        task.log_info('Detected {0} field values for document #{1} ({2})'
                      .format(detected_counter, document_id, doc.name))
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False):
    """
    Log the call and run ``adapt_tables_and_reindex`` unless raw DB caching is disabled.

    :param task: running task; its ``task_name`` and ``main_task_id`` go into the log line.
    :param document_type_code: document type code forwarded unchanged (may be None).
    :param force: forwarded unchanged to ``adapt_tables_and_reindex``.
    """
    caching_disabled = APP_VAR_DISABLE_RAW_DB_CACHING.val
    if caching_disabled:
        task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
        return

    msg = f'manual_reindex called for {document_type_code}. ' \
          f'Task: {task.task_name}, main id: {task.main_task_id}'
    CeleryTaskLogger(task).info(msg)

    adapt_tables_and_reindex(task, document_type_code, force, True)
Ejemplo n.º 17
0
 def start_cache_document_fields_for_doc_ids(cls, task: ExtendedTask, ids: Iterable[Any]) -> None:
     """
     Start caching sub-tasks for the given document ids, at most 10 distinct ids per sub-task.

     Ids are collected into a set; once it holds 10 ids it is submitted as one sub-task
     and a new set is started. A non-empty remainder is submitted at the end.

     :param task: running task used for starting the sub-tasks.
     :param ids: iterable of document ids.
     """
     group_title = 'Cache field values for a set of documents'
     pack_size = 10

     def submit(pack):
         task.run_sub_tasks(group_title,
                            cls.cache_document_fields_for_doc_ids,
                            [(list(pack),)])

     pending = set()
     for doc_id in ids:
         pending.add(doc_id)
         if len(pending) < pack_size:
             continue
         submit(pending)
         pending = set()

     if pending:
         submit(pending)
Ejemplo n.º 18
0
 def convert(task: ExtendedTask,
             document_id: int,
             file_name: str,
             file_path: str):
     """
     Convert the stored file of a document with ``LoadDocuments.convert_pdf2pdfa``.

     The file is obtained as a local file for the duration of the conversion. When the
     conversion returns a path, it is saved as ``alt_source_path`` of the document.

     :param task: running task used for logging and passed to the converter.
     :param document_id: id of the Document to update.
     :param file_name: file name, used only in the log message.
     :param file_path: storage path of the source file.
     """
     with file_storage.get_document_as_local_fn(file_path) as (local_fn, _):
         from apps.task.tasks import LoadDocuments

         task.log_info(f'Convert PDF to PDF-A, document id={document_id}, file_name={file_name}')
         pdfa_path = LoadDocuments.convert_pdf2pdfa(task,
                                                    local_fn,
                                                    file_path,
                                                    fail_silently=False,
                                                    force=True)
         if not pdfa_path:
             return

         document = Document.objects.get(id=document_id)
         document.alt_source_path = pdfa_path
         document.save()
Ejemplo n.º 19
0
def index_documents(task: ExtendedTask,
                    doc_ids: List,
                    changed_by_user_id: int = None,
                    cache_system_fields: FieldSpec = True,
                    cache_generic_fields: FieldSpec = True,
                    cache_user_fields: bool = True):
    """
    Index documents (cache document fields) in parallel.

    The document ids are split into chunks of DOC_NUM_PER_SUB_TASK and one sub-task is
    started per chunk; the remaining arguments are passed to every sub-task unchanged.

    :param task: running task used for starting the sub-tasks.
    :param doc_ids: ids of the documents to index.
    :param changed_by_user_id: forwarded to each sub-task.
    :param cache_system_fields: forwarded to each sub-task.
    :param cache_generic_fields: forwarded to each sub-task.
    :param cache_user_fields: forwarded to each sub-task.
    """
    args = []
    for sub_list in chunks(doc_ids, DOC_NUM_PER_SUB_TASK):
        sub_task_args = (sub_list,
                         changed_by_user_id,
                         cache_system_fields,
                         cache_generic_fields,
                         cache_user_fields)
        args.append(sub_task_args)

    task.run_sub_tasks('Reindex documents', cache_document_fields_for_doc_ids_tracked, args)
Ejemplo n.º 20
0
    def train_document_field_detector_model(
            cls,
            task: ExtendedTask,
            document_type_field: DocumentTypeField,
            train_data_sets: List[Any],
            force=False) -> ClassifierModel:
        """
        Train the field detector model for a document type field and log the outcome.

        :param task: running task used for logging.
        :param document_type_field: provides the document type and the field to train for.
        :param train_data_sets: data passed to ``cls.train_model``; training is skipped
            when it is empty.
        :param force: train even if the document type field is flagged ``use_regexp_always``.
        :return: the trained classifier model, or None when training was skipped.
        """
        document_type = document_type_field.document_type
        field = document_type_field.document_field

        task.log_info('Training model for field #{0} ({1})...'.format(field.pk, field.code))

        regexp_only = not force and document_type_field.use_regexp_always
        if regexp_only:
            task.log_info('Regexp will be used for document_type #{0} and field #{1}.'
                          .format(document_type.pk, field.pk))
            return None

        if not train_data_sets:
            task.log_info('Not enough data to train model for document_type #{0} and field #{1}.'
                          .format(document_type.pk, field.pk))
            return None

        classifier_model = cls.train_model(document_type_field, train_data_sets)
        task.log_info('Finished training model for document_type #{0} and field #{1}. '
                      'Total number of samples: {2}'
                      .format(document_type.pk, field.pk, classifier_model.total_samples))
        return classifier_model
Ejemplo n.º 21
0
    def _detect_field_values_for_document(task: ExtendedTask, document_id,
                                          do_not_write):
        """
        Run field value detection and caching for one document and log the number found.

        :param task: running task used for logging.
        :param document_id: primary key of the Document.
        :param do_not_write: when truthy, detection is called with ``save=False``.
        """
        doc = Document.objects.get(pk=document_id)
        logger = CeleryTaskLogger(task)
        should_save = not do_not_write

        # If the document is in one of completed statuses then
        # the detected values wont be stored even if do_not_write = False.
        # But caching should go as usual.
        detected = field_detection.detect_and_cache_field_values_for_document(logger, doc, save=should_save)

        message = 'Detected {0} field values for document #{1} ({2})'.format(
            len(detected), document_id, doc.name)
        task.log_info(message)
Ejemplo n.º 22
0
 def process_broken(task: ExtendedTask, obj: Union[FieldAnnotation, FieldValue], delete_broken: bool = False):
     """
     Report a broken field annotation / field value and optionally delete it.

     :param task: running task used for logging.
     :param obj: the broken object; its class name, pk and ``value`` are logged.
     :param delete_broken: delete the object; otherwise its admin change URL is logged.
     """
     # Remember the pk up-front: Django's Model.delete() resets the instance pk to None,
     # so reading it after the deletion would log "Id: None".
     obj_pk = obj.pk
     if delete_broken:
         obj.delete()
         task.log_info(f'Found broken {obj.__class__.__name__}.\n'
                       f'Id: {obj_pk}\n'
                       f'JSON value: {obj.value}\n'
                       'The document field value has been deleted.\n')
     else:
         content_type = ContentType.objects.get_for_model(obj.__class__)
         dfv_admin_url = reverse("admin:%s_%s_change" %
                                 (content_type.app_label, content_type.model), args=(obj_pk,))
         task.log_info(f'Found broken {obj.__class__.__name__}.\n'
                       f'Id: {obj_pk}\n'
                       f'JSON value: {obj.value}\n'
                       f'Admin URL: {dfv_admin_url}\n')
Ejemplo n.º 23
0
    def _detect_field_values_for_document(task: ExtendedTask,
                                          document_id,
                                          do_not_write):
        """
        Run field value detection and caching for one document with an active status.

        Documents that have a status which is not active are skipped with a log message.

        :param task: running task used for logging.
        :param document_id: primary key of the Document.
        :param do_not_write: when truthy, detection is called with ``save=False``.
        """
        doc = Document.objects.get(pk=document_id)

        status = doc.status
        if status and not status.is_active:
            task.log_info('Forbidden detecting field values for document with "completed"'
                          ' status, document #{} ({})'.format(document_id, doc.name))
            return

        logger = CeleryTaskLogger(task)
        should_save = not do_not_write
        detected = field_detection.detect_and_cache_field_values_for_document(logger, doc, save=should_save)

        message = 'Detected {0} field values for document #{1} ({2})'.format(
            len(detected), document_id, doc.name)
        task.log_info(message)
    def detect_field_values_for_document(
            task: ExtendedTask, detect_ptrs: DocDetectFieldValuesParams):
        """
        Run field value detection and caching for the document described by ``detect_ptrs``.

        The document is loaded through ``Document.all_objects``; the user of the task is
        passed as the author of the change.

        :param task: running task used for logging; ``task.task.user`` is forwarded.
        :param detect_ptrs: provides ``document_id``, ``do_not_write``,
            ``clear_old_values`` and ``updated_field_codes``.
        """
        doc = Document.all_objects.get(pk=detect_ptrs.document_id)
        log = CeleryTaskLogger(task)

        # If the document is in one of completed statuses then
        # the detected values wont be stored even if do_not_write = False.
        # But caching should go as usual.
        detection_kwargs = dict(changed_by_user=task.task.user,
                                save=not detect_ptrs.do_not_write,
                                clear_old_values=detect_ptrs.clear_old_values,
                                updated_field_codes=detect_ptrs.updated_field_codes)
        dfvs = field_detection.detect_and_cache_field_values_for_document(log, doc, **detection_kwargs)

        task.log_info(f'Detected {len(dfvs)} field values for document ' +
                      f'#{detect_ptrs.document_id} ({doc.name})')
Ejemplo n.º 25
0
    def load_field_values(task: ExtendedTask, document: Document, document_fields_alias_to_value: Dict[str, Any]) \
            -> Dict[DocumentField, FieldValueDTO]:
        """
        Convert raw "alias -> value text" pairs into one FieldValueDTO per field.

        Aliases are matched case-insensitively against the field codes of the document type
        and against its ``field_code_aliases``. Unknown aliases are logged as warnings.
        For a list of candidate texts the first one that yields a value is used.

        :param task: running task used for logging warnings.
        :param document: document whose ``document_type`` defines the known fields.
        :param document_fields_alias_to_value: mapping of field alias to a value text or a
            list of candidate texts; None entries are skipped.
        :return: mapping of field to its FieldValueDTO; empty when the document has no
            document type.
        """
        document_type = document.document_type
        fields_to_values = dict()  # type: Dict[DocumentField, FieldValueDTO]

        if not document_type:
            return fields_to_values

        field_code_aliases = document_type.field_code_aliases

        field_codes_to_fields = {f.code.lower(): f for f in document_type.fields.all()}

        if field_code_aliases:
            field_codes_to_fields.update({field_alias.lower(): field_codes_to_fields.get(field_code.lower())
                                          for field_alias, field_code in field_code_aliases.items() if
                                          field_alias and field_code})

        for field_alias, field_value_text in document_fields_alias_to_value.items():
            if field_value_text is None:
                continue

            field = field_codes_to_fields.get(field_alias.lower())  # type: DocumentField
            if not field:
                task.log_warn(
                    'Field alias "{0}" not found for document type {1}'.format(field_alias, document_type.code))
                continue
            typed_field = TypedField.by(field)  # type: TypedField

            # A single text is handled as a one-element list of candidates.
            if isinstance(field_value_text, list):
                candidates = field_value_text
            else:
                candidates = [field_value_text]

            for possible_value_text in candidates:
                maybe_value = typed_field.extract_from_possible_value_text(possible_value_text)
                # Compare with None explicitly: a truthiness test silently dropped
                # valid falsy values such as 0 or False.
                if maybe_value is not None:
                    maybe_value = typed_field.field_value_python_to_json(maybe_value)
                    fields_to_values[field] = FieldValueDTO(field_value=maybe_value)
                    break

        return fields_to_values
Ejemplo n.º 26
0
    def load_doc(task: ExtendedTask,
                 document: Document,
                 field_values_alias_to_value: Dict[str, Any],
                 run_detect_field_values: bool,
                 field_owners: Dict[str, User] = None):
        """
        Save a document, its metadata record and the field values supplied for it.

        Inside one transaction the document is saved, a DocumentMetadata row is created,
        every parsed value is stored through DocumentFieldRepository and then either field
        detection is run or the "document changed" signal is fired.

        :param task: running task used for logging and as the signal sender.
        :param document: document to save (inserted when it has no pk yet).
        :param field_values_alias_to_value: mapping passed to ``load_field_values``.
        :param run_detect_field_values: run detection instead of firing the signal.
        :param field_owners: optional mapping looked up by field code to get the user
            recorded for each stored value.
        """
        owners = field_owners or {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
        log = CeleryTaskLogger(task)
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        with transaction.atomic():
            is_new = document.pk is None
            document.save(force_insert=is_new)
            DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

            for field, value_dto in fields_to_values.items():
                owner = owners.get(field.code)
                field_repo.update_field_value_with_dto(document=document,
                                                       field=field,
                                                       field_value_dto=value_dto,
                                                       user=owner)

            if not run_detect_field_values:
                signals.fire_document_changed(sender=task,
                                              log=log,
                                              document=document,
                                              changed_by_user=None,
                                              document_initial_load=True,
                                              system_fields_changed=True,
                                              generic_fields_changed=True,
                                              user_fields_changed=True)
            else:
                field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                           document=document,
                                                                           save=True,
                                                                           clear_old_values=False)

        loaded_summary = ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())
        task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                      .format(len(fields_to_values),
                              document.pk,
                              document.name,
                              loaded_summary))
Ejemplo n.º 27
0
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False,
                   project_id: Optional[int] = None):
    """
    Log the requested run parameters and hand over to adapt_tables_and_reindex().

    Does nothing except writing an info message if APP_VAR_DISABLE_RAW_DB_CACHING is set.

    :param task: task used for logging; its task_name and main_task_id are put into the log message.
    :param document_type_code: document type code, always included into the logged parameters.
    :param force: passed to adapt_tables_and_reindex(); logged only when truthy.
    :param project_id: passed to adapt_tables_and_reindex(); logged only when truthy.
    :return:
    """
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return

    # Only the parameters which were actually specified get into the log message.
    params = {'document type': document_type_code}
    if project_id:
        params['project'] = project_id
    if force:
        params['force'] = True
    params_text = ', '.join(f'{key}={value}' for key, value in params.items())

    message = f'manual_reindex called for {params_text}. ' \
              f'Task: {task.task_name}, main id: {task.main_task_id}'
    CeleryTaskLogger(task).info(message)

    adapt_tables_and_reindex(task, document_type_code, force, True, project_id)
    def join_field_detector_model_tests(task: ExtendedTask, field_uid,
                                        document_type_uid,
                                        classifier_model_id):
        """
        Summarize the results of the field detector model test sub-tasks.

        Loads the results of the tasks named TrainAndTest.test_field_detector_model.name
        which have main_task_id equal to task.request.parent_id, calculates the accuracy
        as the share of results with truthy 'value_matches_expected', stores it into
        the classifier model (if its id is specified) and logs the summary.

        :param task: task used for logging and for finding the test sub-tasks.
        :param field_uid: primary key of the DocumentField mentioned in the summary.
        :param document_type_uid: primary key of the DocumentType mentioned in the summary.
        :param classifier_model_id: primary key of the ClassifierModel to store the accuracy into;
                                    nothing is stored if it is falsy.
        :return:
        """
        # NOTE(review): each result is assumed to be a dict - confirm against the test sub-task.
        results = list(Task.objects
                       .filter(main_task_id=task.request.parent_id,
                               name=TrainAndTest.test_field_detector_model.name)
                       .values_list('result', flat=True))

        test_text_units_number = 0
        match_number = 0
        test_doc_number = 0

        for res in results:
            test_doc_number += 1
            test_text_units_number += (res.get('text_units_number') or 0)
            if res.get('value_matches_expected'):
                match_number += 1

        # With no test results the accuracy is undefined. Previously this case crashed
        # with ZeroDivisionError and the summary was never logged.
        accuracy = match_number / test_doc_number if test_doc_number else None

        # Do not overwrite the stored accuracy when there was nothing to measure.
        if classifier_model_id and accuracy is not None:
            classifier_model = ClassifierModel.objects.get(
                pk=classifier_model_id)
            classifier_model.field_detection_accuracy = accuracy
            classifier_model.save()

        field = DocumentField.objects.get(pk=field_uid)
        document_type = DocumentType.objects.get(pk=document_type_uid)

        task.log_info('Testing finished.\n'
                      'Document type: {0}.\n'
                      'Field: {1}.\n'
                      'Text unit type: {2}.\n'
                      'Test documents number: {3}.\n'
                      'Test text units number: {4}.\n'
                      'Accuracy: {5}.\n'.format(document_type.code, field.code,
                                                field.text_unit_type,
                                                test_doc_number,
                                                test_text_units_number,
                                                accuracy))
Ejemplo n.º 29
0
def adapt_tables_and_reindex(task: ExtendedTask, document_type_code: str = None, force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Adapts the raw table of each processed document type to its current field structure
    and plans re-indexing sub-tasks for the documents of this type.

    This task should be always executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task: Task used for logging and for starting the sub-tasks.
    :param document_type_code: Document type code or None to process all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    if document_type_code is None:
        document_types = DocumentType.objects.all()
    else:
        document_types = [DocumentType.objects.get(code=document_type_code)]
    log = CeleryTaskLogger(task)

    for doc_type in document_types:
        table_altered = adapt_table_structure(log, doc_type, force=force_recreate_tables)

        # Each branch only selects which documents to re-index,
        # the sub-tasks themselves are started in one place below.
        if force_recreate_tables:
            # "Force" requested: cancel the other re-index tasks found for this document type
            # and plan (re-plan) re-indexing for all its documents.
            for planned_task in any_other_reindex_task(task.request.id, doc_type.code):
                purge_task(planned_task)
            id_packets = get_all_doc_ids(doc_type.uid, 20)
        elif table_altered or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'.format(doc_type.code))
            # All docs of this type should be re-indexed, but without "force"
            # only those for which re-indexing is not planned yet are taken.
            id_packets = get_all_doc_ids_not_planned_to_index_by_doc_type(doc_type.uid, 20)
        else:
            # The table was not altered: only take the docs which are
            # - not indexed
            # - have no re-index planned for them.
            # Other re-index tasks are not stopped here.
            id_packets = non_indexed_doc_ids_not_planned_to_index(doc_type, 20)

        sub_task_args = [(packet,) for packet in id_packets]
        task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, sub_task_args)
Ejemplo n.º 30
0
    def join_field_detector_model_tests(task: ExtendedTask,
                                        classifier_model_id: Any) -> None:
        """
        Sum up the counters returned by the field detector model test sub-tasks,
        store the totals and the derived metrics into the classifier model and log the metrics.

        The results are taken from the tasks named TrainDocumentField.test_field_detector_model.name
        which have main_task_id equal to task.request.parent_id.

        :param task: task used for logging and for finding the test sub-tasks.
        :param classifier_model_id: primary key of the ClassifierModel to update.
        :return:
        """
        sub_task_results = Task.objects \
            .filter(main_task_id=task.request.parent_id, name=TrainDocumentField.test_field_detector_model.name) \
            .values_list('result', flat=True)

        counter_keys = ('sentences_number',
                        'values_number',
                        'valid_values_number',
                        'wrong_values_number',
                        'able_to_predict')
        totals = dict.fromkeys(counter_keys, 0)
        for sub_task_result in sub_task_results:
            for key in counter_keys:
                totals[key] += sub_task_result[key]

        sentences_number = totals['sentences_number']
        values_number = totals['values_number']
        valid_values_number = totals['valid_values_number']
        wrong_values_number = totals['wrong_values_number']

        # A metric stays None when its denominator is not positive.
        valid_fields_metric = valid_values_number * 100 / values_number if values_number > 0 else None
        sentences_without_values = sentences_number - values_number
        wrong_fields_metric = wrong_values_number * 100 / sentences_without_values \
            if sentences_without_values > 0 else None

        model = ClassifierModel.objects.get(pk=classifier_model_id)
        model.test_valid_fields_metric = valid_fields_metric
        model.test_wrong_fields_metric = wrong_fields_metric
        model.test_sentences_number = sentences_number
        model.test_values_number = values_number
        model.predicted_valid_values_number = valid_values_number
        model.predicted_wrong_values_number = wrong_values_number
        model.able_to_predict = totals['able_to_predict']
        model.save()

        summary = 'Testing of field detector model finished. valid_fields_metric={0}, wrong_fields_metric={1}' \
            .format(str(valid_fields_metric), str(wrong_fields_metric))
        task.log_info(summary)