Beispiel #1
0
    def get_train_values(
            cls, field: DocumentField, train_data_project_ids: Optional[List],
            field_values_only: Set[str],
            use_only_confirmed_field_values: bool) -> List[Dict[str, Any]]:
        """
        Collect the per-document value mappings used to train a detector for ``field``.

        Documents are selected in one of two ways:

        * when ``train_data_project_ids`` is non-empty and
          ``use_only_confirmed_field_values`` is false, by project ids;
        * otherwise, by the union of the "active modified" and "finished"
          document id querysets returned by ``FieldDetectionRepository``.

        In both cases at most ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` documents
        are requested, and only mappings whose value for ``field.code`` is a
        member of ``field_values_only`` are returned.

        :param field: field the training data is collected for.
        :param train_data_project_ids: optional project ids restricting the data set.
        :param field_values_only: allowed values of ``field.code``.
        :param use_only_confirmed_field_values: force the modified/finished document selection.
        :return: list of the matching per-document value mappings.
        """
        values_repo = DocumentFieldRepository()
        detection_repo = FieldDetectionRepository()

        # Only the way documents are selected differs between the two modes.
        if train_data_project_ids and not use_only_confirmed_field_values:
            doc_selector = {'project_ids': train_data_project_ids}
        else:
            modified_ids_qs = detection_repo.get_qs_active_modified_document_ids(
                field, train_data_project_ids)
            finished_ids_qs = detection_repo.get_qs_finished_document_ids(
                field.document_type, train_data_project_ids)
            doc_selector = {'doc_ids': modified_ids_qs.union(finished_ids_qs)}

        docs_values = values_repo.get_field_code_to_python_value_multiple_docs(
            document_type_id=field.document_type_id,
            doc_limit=settings.ML_TRAIN_DATA_SET_GROUP_LEN,
            **doc_selector)

        train_values = []
        for _doc_id, values_by_code in docs_values:
            if values_by_code.get(field.code) in field_values_only:
                train_values.append(values_by_code)
        return train_values
Beispiel #2
0
    def get_user_data(cls, field: DocumentField,
                      project_ids: Optional[List[str]]) -> List[dict]:
        """
        Fetch annotation rows of ``field`` from "active modified" or "finished" documents.

        Only annotations attached to a text unit are considered. Each row holds
        the keys ``modified_by``, ``text_unit__textunittext__text``, ``value``
        and ``extraction_hint``; rows are ordered by ``modified_by`` and
        limited to ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` entries.

        NOTE(review): the value returned is a sliced ``values()`` queryset
        rather than a plain list, despite the annotation.

        :param field: field whose annotations are fetched.
        :param project_ids: optional project ids passed to the document id queries.
        :return: the sliced queryset of value dicts.
        """
        detection_repo = FieldDetectionRepository()
        modified_ids_qs = detection_repo.get_qs_active_modified_document_ids(field, project_ids)
        finished_ids_qs = detection_repo.get_qs_finished_document_ids(field.document_type, project_ids)

        # A document qualifies when it appears in either id queryset.
        in_selected_docs = (Q(document__in=Subquery(modified_ids_qs))
                            | Q(document__in=Subquery(finished_ids_qs)))

        annotations = FieldAnnotation.objects.filter(
            Q(field=field),
            Q(text_unit__isnull=False),
            in_selected_docs)
        rows = annotations.values(
            'modified_by', 'text_unit__textunittext__text', 'value', 'extraction_hint')
        return rows.order_by('modified_by')[:settings.ML_TRAIN_DATA_SET_GROUP_LEN]
Beispiel #3
0
 def train_model_for_dirty_field(task: ExtendedTask, dirty_field_id: Any) -> None:
     """
     Retrain the detector model of a field if it is eligible for retraining.

     Nothing happens when ``can_retrain()`` is false. Otherwise the field's
     ``dirty`` flag is cleared and saved first, and a new model is trained only
     if the number of approved documents has reached
     ``trained_after_documents_number``. A successfully trained model replaces
     all existing ``ClassifierModel`` rows of the field.

     :param task: task used to build the logger passed to the training routine.
     :param dirty_field_id: primary key of the field to retrain.
     """
     field = DocumentField.objects.get(pk=dirty_field_id)
     if not field.can_retrain():
         return

     # The flag is reset before training starts, regardless of its outcome.
     field.dirty = False
     field.save()

     approved_docs = FieldDetectionRepository().get_approved_documents_number(field, None)
     if approved_docs < field.trained_after_documents_number:
         return

     trained_model = field_detection.train_document_field_detector_model(
         CeleryTaskLogger(task), field, None)
     if not trained_model:
         return

     # Drop the previous models only once a replacement is available.
     ClassifierModel.objects.filter(document_field=field).delete()
     trained_model.save()
Beispiel #4
0
    def process(self, **kwargs):
        """
        Train and/or test the field detector model of a document field.

        Keyword arguments read from ``kwargs``:

        * ``document_field_id`` - primary key of the field to process;
        * ``skip_training`` - when truthy, the training step is skipped;
        * ``use_only_confirmed_field_values_for_training`` - passed through to
          the training routine;
        * ``train_data_project_ids`` - projects to train on (optional);
        * ``skip_testing`` - when truthy, the testing step is skipped;
        * ``use_only_confirmed_field_values_for_testing`` - when truthy, test
          documents are limited to the "active modified" and "finished" ones;
        * ``test_data_projects_ids`` - projects to test on; testing is skipped
          when empty.

        A freshly trained model replaces the existing ``ClassifierModel`` rows
        of the field. Testing is scheduled as one sub-task per document plus a
        joining task run after all of them have finished.
        """
        self.log_info(
            'Going to train document field based on the datasets stored in DB...')

        document_field_id = kwargs.get('document_field_id')
        skip_training = kwargs.get('skip_training')
        use_only_confirmed_field_values_for_training = kwargs.get('use_only_confirmed_field_values_for_training')
        train_data_project_ids = kwargs.get('train_data_project_ids')

        skip_testing = kwargs.get('skip_testing')
        use_only_confirmed_field_values_for_testing = kwargs.get('use_only_confirmed_field_values_for_testing')
        test_data_projects_ids = kwargs.get('test_data_projects_ids')

        field = DocumentField.objects.get(pk=document_field_id)

        if not field.is_detectable():
            self.log_info('Field {0} is not detectable. Nothing to train and/or test.'.format(field.code))
            # Stop here: previously execution fell through and still attempted
            # training and testing, contradicting the message just logged.
            return

        new_model = None

        if not skip_training:
            if train_data_project_ids:
                self.log_info('Training model on the specified projects...')
            else:
                self.log_info('No training projects specified. '
                              'Training model on all user-confirmed field values in the system...')

            new_model = field_detection \
                .train_document_field_detector_model(CeleryTaskLogger(self),
                                                     field,
                                                     train_data_project_ids,
                                                     use_only_confirmed_field_values_for_training)
            if new_model:
                # Replace the previous models only once a new one is available.
                ClassifierModel.objects.filter(document_field=field).delete()
                new_model.save()

                if new_model.classifier_accuracy_report_in_sample:
                    self.log_info('Sklearn test report for in-sample docs:\n{0}'
                                  .format(new_model.classifier_accuracy_report_in_sample))

                if new_model.classifier_accuracy_report_out_of_sample:
                    self.log_info('Sklearn test report for out-of-sample docs:\n{0}'
                                  .format(new_model.classifier_accuracy_report_out_of_sample))
            else:
                self.log_info('No model trained. '
                              'Probably the detection strategy of field {0} does not allow training'.format(field.code))

        if skip_testing:
            return

        if not test_data_projects_ids:
            self.log_info('No test projects specified. Skiping the testing step.')
            return

        if not use_only_confirmed_field_values_for_testing:
            # Every document of the field's type in the test projects.
            test_document_ids = Document.objects \
                .filter(project_id__in=test_data_projects_ids, document_type_id=field.document_type.pk) \
                .values_list('pk', flat=True)
        else:
            # Only "active modified" and "finished" documents, de-duplicated.
            fd_repo = FieldDetectionRepository()
            test_document_ids = set(fd_repo.get_qs_active_modified_document_ids(field,
                                                                                test_data_projects_ids))
            test_document_ids.update(fd_repo.get_qs_finished_document_ids(field.document_type,
                                                                          test_data_projects_ids))

        self.log_info('Testing field detection document-by-document...')
        test_tasks_args = [(field.uid, test_document_id) for test_document_id in test_document_ids]

        if test_tasks_args:
            self.run_sub_tasks('Test Field Detector Model', TrainAndTest.test_field_detector_model,
                               test_tasks_args)
            # The joining task receives the new model's pk, or None when no
            # model was trained in this run.
            args_list = [(field.uid, new_model.pk if new_model else None)]
            self.run_after_sub_tasks_finished('Join Field Detector Model Tests',
                                              TrainAndTest.join_field_detector_model_tests,
                                              args_list)