Example 1
def auto_reindex_not_tracked(task: ExtendedTask,
                             document_type_code: Optional[str] = None,
                             force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    task_model = task.task

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force)
        if reindex_needed:
            force_fmt = ', forced' if force else ''
            task.log_info(
                f'Re-index from auto_reindex_not_tracked, {task.name}, '
                f'for {document_type}{force_fmt}')
            call_task_func(manual_reindex, (document_type.code, False),
                           task_model.user_id)
        else:
            if there_are_non_indexed_docs_not_planned_to_index(document_type, log) \
                    and not any_other_reindex_task(task.request.id, document_type.code).exists():
                task.log_info(
                    f'auto_reindex_not_tracked({document_type.code}): '
                    f'there_are_non_indexed_docs_not_planned_to_index')
                call_task_func(manual_reindex, (document_type.code, False),
                               task_model.user_id)
Example 2
def reindex_status_name_for_all_documents_in_system(task: ExtendedTask, status_pk: Any) -> None:
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info('Document caching to raw tables is disabled in Commons / App Vars')
        return
    _reindex_document_ids_packets(task,
                                  get_all_doc_ids_not_planned_to_index_by_status_pk(status_pk, DOC_NUM_PER_SUB_TASK))
Example 3
    def join_field_detector_model_tests(task: ExtendedTask, field_uid,
                                        classifier_model_id):
        results = list(Task.objects
                       .filter(main_task_id=task.request.parent_id,
                               name=TrainAndTest.test_field_detector_model.name)
                       .values_list('result', flat=True))

        test_text_units_number = 0
        match_number = 0
        test_doc_number = 0

        matches_per_value = dict()
        total_per_value = dict()

        for res in results:
            actual_field_value = res.get('actual_field_value')

            if actual_field_value:
                if actual_field_value not in total_per_value:
                    total_per_value[actual_field_value] = 0
                total_per_value[actual_field_value] += 1

            test_doc_number += 1
            test_text_units_number += (res.get('text_units_number') or 0)
            if res.get('value_matches_expected'):
                match_number += 1
                if actual_field_value:
                    if actual_field_value not in matches_per_value:
                        matches_per_value[actual_field_value] = 0
                    matches_per_value[actual_field_value] += 1

        accuracy = match_number / test_doc_number if test_doc_number else 0

        if classifier_model_id:
            classifier_model = ClassifierModel.objects.get(
                pk=classifier_model_id)
            classifier_model.field_detection_accuracy = accuracy
            classifier_model.save()

        field = DocumentField.objects.get(pk=field_uid)

        task.log_info(
            'Testing finished.\n'
            'Document type: {0}.\n'
            'Field: {1}.\n'
            'Text unit type: {2}.\n'
            'Test documents number: {3}.\n'
            'Test text units number: {4}.\n'
            'Accuracy: {5}.\n'.format(field.document_type.code, field.code,
                                      field.text_unit_type, test_doc_number,
                                      test_text_units_number, accuracy))

        if field.is_choice_field():
            accuracy_per_value = {
                actual_field_value:
                (matches_per_value.get(actual_field_value) or 0) / total
                for actual_field_value, total in total_per_value.items()
            }
            task.log_info('Accuracy per value:\n{0}'.format(
                json.dumps(accuracy_per_value, sort_keys=True, indent=2)))
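The per-value aggregation above can be exercised in isolation. A minimal sketch with hypothetical sub-task results, shaped like the result dicts consumed above:

# Toy run of the per-value accuracy aggregation; all values are hypothetical.
results = [
    {'actual_field_value': 'NET_30', 'value_matches_expected': True, 'text_units_number': 3},
    {'actual_field_value': 'NET_30', 'value_matches_expected': False, 'text_units_number': 2},
    {'actual_field_value': 'NET_60', 'value_matches_expected': True, 'text_units_number': 1},
]
matches_per_value = {}
total_per_value = {}
for res in results:
    value = res.get('actual_field_value')
    if value:
        total_per_value[value] = total_per_value.get(value, 0) + 1
        if res.get('value_matches_expected'):
            matches_per_value[value] = matches_per_value.get(value, 0) + 1
accuracy_per_value = {value: matches_per_value.get(value, 0) / total
                      for value, total in total_per_value.items()}
print(accuracy_per_value)  # {'NET_30': 0.5, 'NET_60': 1.0}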
Example 4
    def check_document_field_values(task: ExtendedTask, dfv_ids: Set, delete_broken: bool = False):

        for dfv in DocumentFieldValue.objects \
                .filter(pk__in=dfv_ids) \
                .select_related('field'):  # type: DocumentFieldValue
            try:
                temp_value = dfv.python_value
                if temp_value is not None:
                    field = dfv.field
                    if field.is_choice_field() and not field.is_choice_value(temp_value):
                        raise ValueError('Field value {0} is not in list of its choice values:\n{1}'
                                         .format(temp_value, field.choices))
            except Exception:
                if delete_broken:
                    dfv.delete()
                    msg = render_error('Found broken document field value.\n'
                                       'Document field value id: {0}\n'
                                       'DB value: {1}\n'
                                       'The document field value has been deleted.\n'
                                       .format(dfv.pk, dfv.value))
                else:
                    content_type = ContentType.objects.get_for_model(DocumentFieldValue)
                    dfv_admin_url = reverse("admin:%s_%s_change" %
                                            (content_type.app_label, content_type.model), args=(dfv.pk,))
                    msg = render_error('Found broken document field value.\n'
                                       'Document field value id: {0}\n'
                                       'DB value: {1}\n'
                                       'Admin URL: {2}\n'.format(dfv.pk, dfv.value, dfv_admin_url))
                task.log_info(msg)
Example 5
    def load_doc(task: ExtendedTask, document: Document, document_fields: Dict, run_detect_field_values: bool,
                 field_owners: Optional[dict] = None):
        field_owners = field_owners if field_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, document_fields, field_owners)
        log = CeleryTaskLogger(task)

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            if not new_document:
                DocumentFieldValue.objects \
                    .filter(document=document,
                            removed_by_user=False,
                            created_by__isnull=True,
                            modified_by__isnull=True) \
                    .delete()

            for field, values in fields_to_values.items():
                field_detection.save_detected_values(document, field, values)

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(log, document, True)
            else:
                dfvs = field_detection.detect_and_cache_field_values_for_document(log, document, False)
                field_value_cache.cache_field_values(document, dfvs, save=True)

        task.log_info('Loaded {0} field values for document #{1} ({2})'
                      .format(len(fields_to_values), document.pk, document.name))
Example 6
    def detect_and_process_lease_document(task: ExtendedTask, document_id: int, no_detect: bool):
        doc = Document.objects.get(pk=document_id)
        doc_text = doc.full_text

        try:
            lease_doc = LeaseDocument.objects.get(pk=document_id)
        except LeaseDocument.DoesNotExist:
            lease_doc = None

        if lease_doc or no_detect or ProcessLeaseDocuments.lease_doc_detector.is_lease_document(
                doc_text):
            task.log_info('{2} lease document: #{0}. {1}'
                          .format(document_id,
                                  doc.name,
                                  'Processing' if no_detect else 'Detected'))
            if not lease_doc:
                lease_doc = LeaseDocument(document_ptr=doc)
                lease_doc.__dict__.update(doc.__dict__)

            ProcessLeaseDocuments.process_landlord_tenant(lease_doc, doc_text)
            ProcessLeaseDocuments.process_fields(lease_doc, doc_text, task)

            lease_doc.save()

        else:
            task.log_info('Not a lease document: #{0}. {1}'.format(document_id, doc.name))
Example 7
def update_status_name_for_documents(task: ExtendedTask,
                                     status_pk: Any) -> None:
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    _reindex_document_ids_packets(
        task, get_all_doc_ids_not_planned_to_index_by_status_pk(status_pk, 20))
Example 8
def manual_reindex(task: ExtendedTask,
                   document_type_code: Optional[str] = None,
                   force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    adapt_tables_and_reindex(task, document_type_code, force, True)
Example 9
def reindex_all_project_documents(task: ExtendedTask, project_pk: Any) -> None:
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    _reindex_document_ids_packets(
        task,
        get_all_doc_ids_not_planned_to_index_by_project_pk(
            project_pk, DOC_NUM_PER_SUB_TASK))
Example 10
    def test_field_detector_model(task: ExtendedTask, field_id,
                                  document_id) -> dict:
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

        expected_dfvs = field_detection.detect_and_cache_field_values(
            CeleryTaskLogger(task), document, field,
            save=False)  # type: List[DetectedFieldValue]
        actual_dfvs = list(
            DocumentFieldValue.objects.filter(
                document=document, field=field,
                removed_by_user=False).all())  # type: List[DocumentFieldValue]

        if field.is_value_aware():
            # dates, numbers, etc.
            expected_field_values = field_detection.merge_detected_field_values_to_python_value(
                expected_dfvs)
            expected_field_value = expected_field_values.get(field.code)

            actual_field_values = merge_document_field_values_to_python_value(
                actual_dfvs)
            actual_field_value = actual_field_values.get(field.code)

            matches = bool(expected_field_value == actual_field_value)
        else:
            # related-info, etc.
            expected_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in expected_dfvs if dfv.text_unit
            }
            expected_field_value = '; '.join(sorted(expected_set))

            actual_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in actual_dfvs if dfv.text_unit
            }
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            task.log_info(
                '{3} Test doc: {0} (Project: {5}). '
                'Detected: {1}. Real: {2}.\nDetected in text:-----\n{4}\n-----'
                .format(
                    document.name, expected_field_value, actual_field_value,
                    '[  OK  ]' if matches else '[ ERR  ]', '\n---\n'.join(
                        [dfv.text_unit.text
                         for dfv in expected_dfvs]) if expected_dfvs else '',
                    document.project.name if document.project else ''))

        text_units_number = TextUnit.objects.filter(
            document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number': text_units_number,
            'value_matches_expected': matches
        }
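For non-value-aware fields the test above reduces to a set comparison of annotated text units. A self-contained sketch with hypothetical text-unit ids:

# Hypothetical text-unit ids; mirrors the related-info branch above.
expected_set = {'text_unit_101', 'text_unit_102'}
actual_set = {'text_unit_101', 'text_unit_103'}
expected_field_value = '; '.join(sorted(expected_set))
actual_field_value = '; '.join(sorted(actual_set))
matches = expected_set == actual_set  # False: unit 102 was detected, unit 103 was expected
print(matches, '|', expected_field_value, 'vs', actual_field_value)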
Example 11
    def local_train_model_for_document_class(task: ExtendedTask,
                                             document_class_name: str):
        task.log_info(
            'Building classifier model for document class: {0}'.format(
                document_class_name))

        classifier_model = ClassifierModel.objects.get(
            kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
            document_class=document_class_name,
            document_field=None)

        target_names = list(
            ClassifierDataSetEntry.objects.filter(
                field_detection_model=classifier_model).values_list(
                    'category', flat=True).distinct())

        target = []
        data = []
        for target_index, target_name in enumerate(target_names):
            entries = list(
                ClassifierDataSetEntry.objects.filter(
                    field_detection_model=classifier_model,
                    category=target_name).values_list('text', flat=True))
            target.extend(len(entries) * [target_index])
            data.extend(entries)
        target = np.array(target)
        data = np.array(data)

        random_state = check_random_state(seed=None)
        indices = np.arange(data.shape[0])
        random_state.shuffle(indices)
        data = data[indices]
        target = target[indices]

        text_clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf',
             SGDClassifier(loss='hinge',
                           penalty='l2',
                           alpha=1e-3,
                           random_state=42,
                           max_iter=5,
                           tol=None)),
        ])
        sklearn_model = text_clf.fit(data, target)

        model = SkLearnClassifierModel(sklearn_model=sklearn_model,
                                       target_names=target_names)

        classifier_model.set_trained_model_obj(model)
        classifier_model.save()

        task.log_info(
            'Trained model based on {0} dataset entries for document class: {1}'
            .format(len(target), document_class_name))
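The same Pipeline configuration can be sanity-checked outside Celery. A minimal, self-contained sketch on toy data (the sentences and category names are invented):

# Standalone sanity check of the Pipeline configuration used above.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

data = ['the term of this lease is five years',
        'tenant shall pay rent monthly',
        'landlord may terminate upon default',
        'rent is due on the first of each month']
target = [0, 1, 0, 1]              # hypothetical category indices
target_names = ['term', 'rent']    # hypothetical category names

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
sklearn_model = text_clf.fit(data, target)
predicted = sklearn_model.predict(['rent shall be paid on the first'])
print(target_names[predicted[0]])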
Example 12
    def _detect_field_values_for_document(task: ExtendedTask, document_id,
                                          do_not_write, field_uid):
        doc = Document.objects.get(pk=document_id)

        if doc.status and not doc.status.is_active:
            task.log_info(
                'Forbidden detecting field values for document with "completed"'
                ' status, document #{} ({})'.format(document_id, doc.name))
            return

        document_type = doc.document_type
        fields = [DocumentField.objects.get(pk=field_uid)] if field_uid \
            else document_type.fields.all()  # type: List[DocumentField]

        sentence_text_units = list(
            TextUnit.objects.filter(document=doc, unit_type="sentence"))

        detected_counter = 0
        for field in fields:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            DocumentFieldValue.objects \
                .filter(document=doc,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .delete()

            if field.python_coded_field:
                detected_counter += DetectFieldValues \
                    .detect_field_values_for_python_coded_field(doc, field, sentence_text_units, do_not_write)
            else:
                classifier_model = None
                document_type_field = DocumentTypeField.objects.get(
                    document_type=document_type, document_field=field)

                if not document_type_field.use_regexp_always:
                    try:
                        classifier_model = ClassifierModel.objects \
                            .get(document_type=document_type, document_field=field)
                    except ClassifierModel.DoesNotExist:
                        pass

                if classifier_model:
                    detected_counter += DetectFieldValues \
                        .detect_field_values_with_model(classifier_model, doc, field, sentence_text_units, do_not_write)
                else:
                    detected_counter += DetectFieldValues \
                        .detect_field_values_with_regexps(doc, field, sentence_text_units, do_not_write)

        task.log_info(
            'Detected {0} field values for document #{1} ({2})'.format(
                detected_counter, document_id, doc.name))
Example 13
def manual_reindex(task: ExtendedTask,
                   document_type_code: Optional[str] = None,
                   force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    msg = f'manual_reindex called for {document_type_code}. ' \
          f'Task: {task.task_name}, main id: {task.main_task_id}'
    log = CeleryTaskLogger(task)
    log.info(msg)
    adapt_tables_and_reindex(task, document_type_code, force, True)
Example 14
def convert(task: ExtendedTask,
            document_id: int,
            file_name: str,
            file_path: str):
    with file_storage.get_document_as_local_fn(file_path) as (local_file_path, _):
        from apps.task.tasks import LoadDocuments
        task.log_info(f'Convert PDF to PDF-A, document id={document_id}, file_name={file_name}')
        alt_source_path = LoadDocuments.convert_pdf2pdfa(task, local_file_path, file_path,
                                                         fail_silently=False, force=True)
        if alt_source_path:
            document = Document.objects.get(id=document_id)
            document.alt_source_path = alt_source_path
            document.save()
Example 15
    def train_document_field_detector_model(
            cls,
            task: ExtendedTask,
            document_type_field: DocumentTypeField,
            train_data_sets: List[Any],
            force=False) -> Optional[ClassifierModel]:
        document_type = document_type_field.document_type
        field = document_type_field.document_field

        task.log_info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        if not force and document_type_field.use_regexp_always:
            task.log_info(
                'Regexp will be used for document_type #{0} and field #{1}.'.
                format(document_type.pk, field.pk))
        elif not train_data_sets:
            task.log_info(
                'Not enough data to train model for document_type #{0} and field #{1}.'
                .format(document_type.pk, field.pk))
        else:
            classifier_model = cls.train_model(document_type_field,
                                               train_data_sets)
            task.log_info(
                'Finished training model for document_type #{0} and field #{1}. '
                'Total number of samples: {2}'.format(
                    document_type.pk, field.pk,
                    classifier_model.total_samples))

            return classifier_model
Example 16
def process_broken(task: ExtendedTask, obj: Union[FieldAnnotation, FieldValue], delete_broken: bool = False):
    if delete_broken:
        obj.delete()
        task.log_info(f'Found broken {obj.__class__.__name__}.\n'
                      f'Id: {obj.pk}\n'
                      f'JSON value: {obj.value}\n'
                      'The document field value has been deleted.\n')
    else:
        content_type = ContentType.objects.get_for_model(obj.__class__)
        dfv_admin_url = reverse("admin:%s_%s_change" %
                                (content_type.app_label, content_type.model), args=(obj.pk,))
        task.log_info(f'Found broken {obj.__class__.__name__}.\n'
                      f'Id: {obj.pk}\n'
                      f'JSON value: {obj.value}\n'
                      f'Admin URL: {dfv_admin_url}\n')
Example 17
    def _detect_field_values_for_document(task: ExtendedTask, document_id,
                                          do_not_write):
        doc = Document.objects.get(pk=document_id)

        log = CeleryTaskLogger(task)

        # If the document is in one of the completed statuses then
        # the detected values won't be stored even if do_not_write = False.
        # But caching should go as usual.
        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(log, doc, save=not do_not_write)

        task.log_info(
            'Detected {0} field values for document #{1} ({2})'.format(
                len(dfvs), document_id, doc.name))
Example 18
def adapt_tables_and_reindex(task: ExtendedTask, document_type_code: str = None, force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    Checks if raw table with field values of doc type needs to be altered according to the changed
    field structure and triggers document reindexing if needed.

    This task should be always executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested, we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id, document_type.code):
                purge_task(prev_task)
            args = [(ids,) for ids in get_all_doc_ids(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            task.log_info('Raw DB table for document type {0} has been altered. '
                          'Initiating re-index for all documents of this document type.'.format(document_type.code))
            # If we altered the field structure then we need to re-index all docs of this type.
            # If no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet
            args = [(ids,) for ids in get_all_doc_ids_not_planned_to_index_by_doc_type(document_type.uid, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type
            # then we trigger the re-index task, making it index non-indexed docs only.
            # In this case we don't stop other re-index tasks, but this one can be stopped later
            # if a full reindex is requested.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            args = [(ids,) for ids in non_indexed_doc_ids_not_planned_to_index(document_type, 20)]
            task.run_sub_tasks('Reindex set of documents', cache_document_fields_for_doc_ids_tracked, args)
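The docstring above refers to settings.py/CELERY_BEAT_SCHEDULE. A hypothetical beat entry routing the check to a serial queue could look like this (the task path and queue name are assumptions, not taken from the project):

# Hypothetical Celery Beat entry; task path and queue name are assumptions.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'adapt-tables-and-reindex': {
        'task': 'apps.rawdb.tasks.adapt_tables_and_reindex',  # assumed module path
        'schedule': crontab(minute='*/10'),                   # every 10 minutes
        'options': {'queue': 'serial'},                       # avoid parallel table modifications
    },
}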
Example 19
    def run_train_model_for_field(task: ExtendedTask, document_type_uid,
                                  field_uid, no_field_sentences,
                                  trigger_re_detecting_field_values):
        document_type = DocumentType.objects.get(pk=document_type_uid)
        field = DocumentField.objects.get(pk=field_uid)

        task.log_info('Training model for field #{0} ({1})...'.format(
            field_uid, field.code))

        document_type_field = DocumentTypeField.objects.get(
            document_type=document_type, document_field=field)

        if document_type_field.use_regexp_always:
            task.log_info(
                'Regexp will be used for document_type #{0} and field #{1}.'.
                format(document_type_uid, field_uid))
            return None

        train_data = TrainDocumentFieldDetectorModel.get_train_data(
            document_type, field,
            document_type_field.trained_after_documents_number)
        if train_data:
            train_data = TrainDirtyDocumentFieldDetectorModel.get_train_data_generator(
                train_data)
            total_samples = TrainDirtyDocumentFieldDetectorModel.train_model(
                train_data, document_type, field, no_field_sentences)
            task.log_info(
                'Finished training model for document_type #{0} and field #{1}. '
                'Total number of samples: {2}'.format(document_type_uid,
                                                      field_uid,
                                                      total_samples))
        else:
            task.log_info(
                'Not enough data to train model for document_type #{0} and field #{1}.'
                .format(document_type_uid, field_uid))
            return None

        if trigger_re_detecting_field_values:
            detect_field_values_for_document_args = []

            documents = Document.objects.active()\
                .filter(document_type=document_type)\
                .values_list('pk', 'name', 'source')
            source_data = []

            for document_id, name, source in documents:
                if source:
                    source_data.append('{0}/{1}'.format(source, name))
                else:
                    source_data.append(name)

                detect_field_values_for_document_args.append(
                    (document_id, False, field_uid))
            task.run_sub_tasks(
                'Detect Values of Field {0} for Each Document'.format(
                    field.code),
                DetectFieldValues.detect_field_values_for_document,
                detect_field_values_for_document_args, source_data)
Example 20
    def _detect_field_values_for_document(task: ExtendedTask,
                                          document_id,
                                          do_not_write):
        doc = Document.objects.get(pk=document_id)

        if doc.status and not doc.status.is_active:
            task.log_info('Forbidden detecting field values for document with "completed"'
                          ' status, document #{} ({})'.format(document_id, doc.name))
            return

        log = CeleryTaskLogger(task)

        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(log, doc, save=not do_not_write)

        task.log_info('Detected {0} field values for document #{1} ({2})'.format(
            len(dfvs), document_id, doc.name))
Example 21
    def detect_field_values_for_document(
            task: ExtendedTask, detect_ptrs: DocDetectFieldValuesParams):
        doc = Document.all_objects.get(pk=detect_ptrs.document_id)
        log = CeleryTaskLogger(task)

        # If the document is in one of the completed statuses then
        # the detected values won't be stored even if do_not_write = False.
        # But caching should go as usual.
        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(log,
                                                        doc,
                                                        changed_by_user=task.task.user,
                                                        save=not detect_ptrs.do_not_write,
                                                        clear_old_values=detect_ptrs.clear_old_values,
                                                        updated_field_codes=detect_ptrs.updated_field_codes)

        task.log_info(f'Detected {len(dfvs)} field values for document ' +
                      f'#{detect_ptrs.document_id} ({doc.name})')
Example 22
    def load_doc(task: ExtendedTask,
                 document: Document,
                 field_values_alias_to_value: Dict[str, Any],
                 run_detect_field_values: bool,
                 field_owners: Dict[str, User] = None):
        field_owners = field_owners if field_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
        log = CeleryTaskLogger(task)
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

            for field, value_dto in fields_to_values.items():
                field_repo.update_field_value_with_dto(document=document,
                                                       field=field,
                                                       field_value_dto=value_dto,
                                                       user=field_owners.get(field.code))

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                           document=document,
                                                                           save=True,
                                                                           clear_old_values=False)
            else:
                signals.fire_document_changed(sender=task,
                                              log=log,
                                              document=document,
                                              changed_by_user=None,
                                              document_initial_load=True,
                                              system_fields_changed=True,
                                              generic_fields_changed=True,
                                              user_fields_changed=True)

        task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                      .format(len(fields_to_values),
                              document.pk,
                              document.name,
                              ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))
Example 23
def manual_reindex(task: ExtendedTask,
                   document_type_code: Optional[str] = None,
                   force: bool = False,
                   project_id: Optional[int] = None):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    run_parameters = {'document type': document_type_code}
    if project_id:
        run_parameters['project'] = project_id
    if force:
        run_parameters['force'] = True
    ptrs_str = ', '.join([f'{p}={run_parameters[p]}' for p in run_parameters])

    msg = f'manual_reindex called for {ptrs_str}. ' \
          f'Task: {task.task_name}, main id: {task.main_task_id}'
    log = CeleryTaskLogger(task)
    log.info(msg)
    adapt_tables_and_reindex(task, document_type_code, force, True, project_id)
Example 24
    def join_field_detector_model_tests(task: ExtendedTask, field_uid,
                                        document_type_uid,
                                        classifier_model_id):
        results = list(Task.objects
                       .filter(main_task_id=task.request.parent_id,
                               name=TrainAndTest.test_field_detector_model.name)
                       .values_list('result', flat=True))

        test_text_units_number = 0
        match_number = 0
        test_doc_number = 0

        for res in results:
            test_doc_number += 1
            test_text_units_number += (res.get('text_units_number') or 0)
            if res.get('value_matches_expected'):
                match_number += 1

        accuracy = match_number / test_doc_number if test_doc_number else 0

        if classifier_model_id:
            classifier_model = ClassifierModel.objects.get(
                pk=classifier_model_id)
            classifier_model.field_detection_accuracy = accuracy
            classifier_model.save()

        field = DocumentField.objects.get(pk=field_uid)
        document_type = DocumentType.objects.get(pk=document_type_uid)

        task.log_info('Testing finished.\n'
                      'Document type: {0}.\n'
                      'Field: {1}.\n'
                      'Text unit type: {2}.\n'
                      'Test documents number: {3}.\n'
                      'Test text units number: {4}.\n'
                      'Accuracy: {5}.\n'.format(document_type.code, field.code,
                                                field.text_unit_type,
                                                test_doc_number,
                                                test_text_units_number,
                                                accuracy))
Example 25
    def join_field_detector_model_tests(task: ExtendedTask,
                                        classifier_model_id: Any) -> None:
        results = Task.objects \
            .filter(main_task_id=task.request.parent_id, name=TrainDocumentField.test_field_detector_model.name) \
            .values_list('result', flat=True)
        valid_fields_metric = None
        wrong_fields_metric = None
        sentences_number = 0
        values_number = 0
        valid_values_number = 0
        wrong_values_number = 0
        able_to_predict = 0

        for result in results:
            sentences_number += result['sentences_number']
            values_number += result['values_number']
            valid_values_number += result['valid_values_number']
            wrong_values_number += result['wrong_values_number']
            able_to_predict += result['able_to_predict']
        if values_number > 0:
            valid_fields_metric = valid_values_number * 100 / values_number
        if sentences_number - values_number > 0:
            wrong_fields_metric = wrong_values_number * 100 / (
                sentences_number - values_number)

        classifier_model = ClassifierModel.objects.get(pk=classifier_model_id)
        classifier_model.test_valid_fields_metric = valid_fields_metric
        classifier_model.test_wrong_fields_metric = wrong_fields_metric
        classifier_model.test_sentences_number = sentences_number
        classifier_model.test_values_number = values_number
        classifier_model.predicted_valid_values_number = valid_values_number
        classifier_model.predicted_wrong_values_number = wrong_values_number
        classifier_model.able_to_predict = able_to_predict
        classifier_model.save()

        task.log_info(
            'Testing of field detector model finished. valid_fields_metric={0}, wrong_fields_metric={1}'
            .format(str(valid_fields_metric), str(wrong_fields_metric)))
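A quick numeric check of the two metrics with hypothetical counters:

# Hypothetical counters illustrating the metrics computed above.
sentences_number = 100
values_number = 20        # sentences that actually carry a field value
valid_values_number = 15  # value sentences predicted correctly
wrong_values_number = 8   # values predicted on sentences without one

valid_fields_metric = valid_values_number * 100 / values_number  # 75.0
wrong_fields_metric = wrong_values_number * 100 / (sentences_number - values_number)  # 10.0
print(valid_fields_metric, wrong_fields_metric)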
Example 26
    def sync_imanage_document(task: ExtendedTask, imanage_config_id: int, imanage_doc_id: str):
        task.log_info('Synchronizing iManage document #{0} of config #{1}'.format(imanage_doc_id, imanage_config_id))
        imanage_doc = IManageDocument.objects \
            .filter(imanage_config_id=imanage_config_id, imanage_doc_id=imanage_doc_id) \
            .select_related('imanage_config').get()
        file_storage = get_file_storage()
        try:
            imanage_config = imanage_doc.imanage_config
            log = CeleryTaskLogger(task)
            project = imanage_config.resolve_dst_project(imanage_doc.imanage_doc_data, log)
            project_id = project.pk

            assignee = imanage_config.resolve_assignee(imanage_doc.imanage_doc_data, log)
            assignee_id = assignee.pk if assignee else None
            task.log_info('Assignee resolved to: {0}'.format(assignee.get_full_name() if assignee else '<no assignee>'))

            task.log_info('Downloading iManage document contents into a temp file...')
            auth_token = imanage_config.login()
            filename, response = imanage_config.load_document(auth_token, imanage_doc_id)

            upload_session_id = str(uuid.uuid4())
            filename = get_valid_filename(filename)
            rel_filepath = os.path.join(upload_session_id, filename)

            _, ext = os.path.splitext(filename) if filename else (None, None)
            with buffer_contents_into_temp_file(response, ext) as temp_fn:

                # upload file to file storage
                with open(temp_fn, 'rb') as f:
                    file_storage.mk_doc_dir(upload_session_id)
                    file_storage.write_document(rel_filepath, f)

                kwargs = {
                    'document_type_id': imanage_config.document_type_id,
                    'project_id': project_id,
                    'assignee_id': assignee_id,
                    'user_id': get_main_admin_user().pk,
                    'propagate_exception': True,
                    'run_standard_locators': True,
                    'metadata': {},
                    'do_not_check_exists': True
                }

                pre_defined_fields = None
                if imanage_doc.imanage_doc_data and imanage_config.imanage_to_contraxsuite_field_binding:
                    pre_defined_fields = dict()
                    for imanage_field_code, contraxsuite_field_code \
                            in dict(imanage_config.imanage_to_contraxsuite_field_binding).items():
                        imanage_field_value = imanage_doc.imanage_doc_data.get(imanage_field_code)
                        if imanage_field_value:
                            pre_defined_fields[contraxsuite_field_code] = imanage_field_value
                            task.log_info('Assigning iManage field {0} to Contraxsuite field {1}: {2}'
                                          .format(imanage_field_code, contraxsuite_field_code, imanage_field_value))
                        else:
                            task.log_info('iManage field {0} has no value assigned.'
                                          .format(imanage_field_code))
                else:
                    task.log_info('No binding of iManage fields to Contraxsuite fields.')

                document_id = LoadDocuments \
                    .create_document_local(task, temp_fn, rel_filepath, kwargs,
                                           return_doc_id=True,
                                           pre_defined_doc_fields_code_to_python_val=pre_defined_fields)

                if document_id:
                    task.log_info('Created Contraxsuite document #{0}'.format(document_id))
                    imanage_doc.document_id = document_id
                    imanage_doc.last_sync_date = timezone.now()
                    imanage_doc.save(update_fields=['document_id', 'last_sync_date'])
                else:
                    task.log_error('Unable to create Contraxsuite document for '
                                   'iManage document #{0}'.format(imanage_doc_id))
                    raise RuntimeError('No document loaded.')
        except Exception as ex:
            msg = render_error('Unable to synchronize iManage document #{0}'.format(imanage_doc_id), ex)
            task.log_error(msg)
            imanage_doc.import_problem = True
            imanage_doc.save(update_fields=['import_problem'])
Example 27
    def test_field_detector_model(task: ExtendedTask, classifier_model_id: Any,
                                  test_document_id: Any) -> dict:
        classifier_model = ClassifierModel.objects.get(pk=classifier_model_id)
        document = Document.objects.get(pk=test_document_id)
        field = classifier_model.document_field
        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = FIELD_TYPES_REGISTRY[field.type]
        text_unit_with_value_ids = set()
        able_to_predict_text_units_ids = set()
        sentences_number = 0
        values_number = 0
        valid_values_number = 0
        able_to_predict = 0

        task.log_info(
            'Testing field detector model for document #{0}, field {1}...'.
            format(test_document_id, field.code))

        for text_unit in TextUnit.objects.filter(document_id=test_document_id,
                                                 unit_type="sentence"):
            value_found = not (field_type_adapter.multi_value or field.is_choice_field()) \
                          and len(text_unit_with_value_ids) > 0
            if not value_found:
                sentences_number += 1

            field_uid, value, hint_name = DetectFieldValues.predict_value(
                sklearn_model=sklearn_model, text_unit=text_unit)
            if field_uid is not None:
                if field_type_adapter.value_extracting:
                    value, hint_name = DetectFieldValues.extract_value(
                        field_type_adapter=field_type_adapter,
                        document=document,
                        field=field,
                        hint_name=hint_name,
                        text_unit=text_unit)
                else:
                    value = True
            if value is not None:
                able_to_predict_text_units_ids.add(text_unit.pk)
                if not value_found:
                    text_unit_with_value_ids.add(text_unit.pk)

        test_values = DocumentFieldValue.objects \
            .filter(document_id=test_document_id, field_id=field.pk) \
            .values_list('sentence_id', flat=True) \
            .distinct('sentence_id') \
            .order_by('sentence_id')
        for text_unit_id in test_values:
            values_number += 1
            if text_unit_id in text_unit_with_value_ids:
                text_unit_with_value_ids.remove(text_unit_id)
                valid_values_number += 1
                able_to_predict += 1
            elif text_unit_id in able_to_predict_text_units_ids:
                able_to_predict += 1

        task.log_info('Testing on document #{0}, field {1} finished'.format(
            test_document_id, field.code))
        return {
            'sentences_number': sentences_number,
            'values_number': values_number,
            'valid_values_number': valid_values_number,
            'wrong_values_number': len(text_unit_with_value_ids),
            'able_to_predict': able_to_predict
        }
Example 28
    def build_dataset_on_document(task: ExtendedTask, document_class_name: str,
                                  document_id, retrain_model: bool):
        field_configs = DOCUMENT_FIELDS[document_class_name]
        if not field_configs:
            return

        document_class = BuildFieldDetectorDataset._get_doc_class(
            document_class_name)

        doc = document_class.objects.get(pk=document_id)

        classifier_model, created = ClassifierModel.objects.get_or_create(
            kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
            document_class=document_class_name,
            document_field=None)

        deleted, rows_count = ClassifierDataSetEntry.objects.filter(
            field_detection_model=classifier_model, document=doc).delete()
        if deleted > 0:
            task.log_info(
                'Deleted {0} data set entries of document {1}'.format(
                    deleted, doc.pk))

        def add(code, sentence):
            ClassifierDataSetEntry.objects.create(
                field_detection_model=classifier_model,
                document=doc,
                category=code,
                text=sentence)

        task.log_info('Extracting training data from document: {0}'.format(
            doc.pk))

        text = doc.full_text
        annotations = list(
            DocumentAnnotation.objects.filter(document__pk=doc.pk))
        sentence_spans = get_sentence_span_list(text)
        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            annotated_fields = set()
            added = False
            if annotations:
                for a in annotations:
                    if a.document_field \
                            and a.start_offset <= span[1] and span[0] <= a.end_offset:
                        field_code = a.document_field.pk
                        add(field_code, sentence)
                        annotated_fields.add(field_code)
                        added = True

            for field_config in field_configs.values():
                if field_config.field_code not in annotated_fields \
                        and field_config.sentence_matches_field_detectors(sentence):
                    add(field_config.field_code, sentence)
                    added = True
            if not added:
                add('', sentence)
        task.log_info('Processed {0} sentences of document {1}'.format(
            len(sentence_spans), doc.pk))

        if retrain_model:
            TrainFieldDetectorModel.train_model_for_document_class.apply_async(
                args=(document_class_name, ))
Example 29
    def parse_document_for_employee(task: ExtendedTask, document_id: int,
                                    no_detect: bool):
        detect = not no_detect
        document = Document.objects.get(pk=document_id)

        task.log_info('Process employment document: #{}. {}'.format(
            document_id, document.name))

        if detect and not is_employment_doc(document.full_text
                                            or document.text):
            task.log_info('Not an employment document: #{}. {}'.format(
                document_id, document.name))
            return

        employee_dict = {}
        provisions = []

        for tu_id, paragraph_text in TextUnit.objects.filter(
                document_id=document_id,
                unit_type="paragraph").values_list('id', 'textunittext__text'):
            # skip if all text in uppercase
            if paragraph_text == paragraph_text.upper():
                continue
            try:
                sentences = segment_sentences(paragraph_text)
            except Exception:
                # treat the whole paragraph as one sentence if the segmenter errors out
                sentences = [paragraph_text]
            for text in sentences:

                # clean
                text = text.replace('[', '(').replace(']', ')')

                # get values not yet found. This logic assumes only one of each
                # of these values found per document.
                # if there is more than one it will only pick up the first (except effective date)
                if employee_dict.get('name') is None:
                    employee_dict['name'] = get_employee_name(text)
                if employee_dict.get('employer') is None:
                    employee_dict['employer'] = get_employer_name(text)
                if employee_dict.get('annual_salary') is None:
                    get_salary_result = get_salary(text)
                    if get_salary_result is not None:
                        employee_dict['annual_salary'] = get_salary_result[0][0] * get_salary_result[1]
                        employee_dict['salary_currency'] = get_salary_result[0][1]
                if employee_dict.get('effective_date') is None:
                    employee_dict['effective_date'] = get_effective_date(text)
                if employee_dict.get('vacation') is None:
                    get_vacation_result = get_vacation_duration(text)
                    if get_vacation_result is not None:
                        yearly_amount = get_vacation_result[0][1] * get_vacation_result[1]
                        employee_dict['vacation'] = str(yearly_amount) + " " + str(get_vacation_result[0][0]) + "s"
                if employee_dict.get('governing_geo') is None:
                    employee_dict['governing_geo'] = get_governing_geo(text)

            non_compete_similarity = get_similar_to_non_compete(paragraph_text)
            if non_compete_similarity > .5:
                provisions.append({
                    "text_unit": tu_id,
                    "similarity": non_compete_similarity,
                    "type": "noncompete"
                })

            termination_similarity = get_similar_to_termination(paragraph_text)
            if termination_similarity > .5:
                provisions.append({
                    "text_unit": tu_id,
                    "similarity": termination_similarity,
                    "type": "termination"
                })

            benefits_similarity = get_similar_to_benefits(paragraph_text)
            if benefits_similarity > .5:
                provisions.append({
                    "text_unit": tu_id,
                    "similarity": benefits_similarity,
                    "type": "benefits"
                })

            severance_similarity = get_similar_to_severance(paragraph_text)
            if severance_similarity > .5:
                provisions.append({
                    "text_unit": tu_id,
                    "similarity": severance_similarity,
                    "type": "severance"
                })

        employee = employer = None
        # create Employee only if his/her name exists
        if employee_dict.get('name') is not None:
            employee, ee_created = Employee.objects.get_or_create(
                name=employee_dict['name'],
                annual_salary=employee_dict.get('annual_salary'),
                salary_currency=employee_dict.get('salary_currency'),
                effective_date=employee_dict.get('effective_date'),
                vacation_yearly=employee_dict.get('vacation'),
                governing_geo=employee_dict.get('governing_geo'),
                document=Document.objects.get(pk=document_id))

        if len(provisions) > 0 and employee is not None:
            noncompete_found = termination_found = \
                severance_found = benefits_found = False

            for i in provisions:
                if i["type"] == "noncompete":
                    noncompete_found = True
                elif i["type"] == "termination":
                    termination_found = True
                elif i["type"] == "benefits":
                    benefits_found = True
                elif i["type"] == "severance":
                    severance_found = True
                Provision.objects.get_or_create(
                    text_unit_id=i["text_unit"],
                    similarity=i["similarity"],
                    employee=employee,
                    document=Document.objects.get(pk=document_id),
                    type=i["type"])
            employee.has_noncompete = noncompete_found
            employee.has_termination = termination_found
            employee.has_benefits = benefits_found
            employee.has_severance = severance_found
            employee.save()

        # create Employer
        if employee and employee_dict.get('employer') is not None:
            employer, er_created = Employer.objects.get_or_create(
                name=employee_dict['employer'])

        if employee and employer and not employee.employer:
            employee.employer = employer
            employee.save()
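The flag aggregation over provisions reduces to simple membership tests. A toy equivalent:

# Toy run of the provision-flag aggregation; the provisions list is hypothetical.
provisions = [
    {"text_unit": 1, "similarity": 0.8, "type": "noncompete"},
    {"text_unit": 2, "similarity": 0.6, "type": "severance"},
]
flags = {t: any(p["type"] == t for p in provisions)
         for t in ("noncompete", "termination", "benefits", "severance")}
print(flags)  # {'noncompete': True, 'termination': False, 'benefits': False, 'severance': True}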
Example 30
def finalize(_self: ExtendedTask):
    _self.log_info('Cleanup redis keys')
    for k in redis.list_keys(f'{_self.main_task_id}_*'):
        redis.r.delete(k)
        _self.log_info(f'Deleted redis key "{k}"')
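The cleanup relies on the project's redis wrapper. An equivalent with plain redis-py, assuming a default local connection and a hypothetical task id, would be:

# Equivalent cleanup with plain redis-py; connection details are assumptions.
import redis

r = redis.Redis(host='localhost', port=6379, db=0)
main_task_id = 'some-task-id'  # hypothetical
for key in r.scan_iter(match=f'{main_task_id}_*'):
    r.delete(key)
    print(f'Deleted redis key "{key.decode()}"')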