        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            annotated_fields = set()
            added = False
            if annotations:
                for a in annotations:
                    if a.document_field \
                            and a.start_offset <= span[1] and span[0] <= a.end_offset:
                        field_code = a.document_field.pk
                        add(field_code, sentence)
                        annotated_fields.add(field_code)
                        added = True

            for field_config in field_configs.values():
                if field_config.field_code not in annotated_fields \
                        and field_config.sentence_matches_field_detectors(sentence):
                    add(field_config.field_code, sentence)
                    added = True
            if not added:
                add('', sentence)
        task.log_info('Processed {0} sentences of document {1}'.format(
            len(sentence_spans), doc.pk))

        if retrain_model:
            TrainFieldDetectorModel.train_model_for_document_class.apply_async(
                args=(document_class_name, ))


app.register_task(BuildFieldDetectorDataset())
app.register_task(TrainFieldDetectorModel())
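# A minimal sketch (not from the source) of the overlap test used above when matching
# annotations to sentence spans: an annotation [start_offset, end_offset] overlaps a
# sentence span [span_start, span_end] when neither interval lies wholly before the other.
def spans_overlap(span_start: int, span_end: int, start_offset: int, end_offset: int) -> bool:
    return start_offset <= span_end and span_start <= end_offset

assert spans_overlap(0, 50, 40, 60)        # annotation starts inside the sentence
assert not spans_overlap(0, 50, 60, 80)    # annotation begins after the sentence ends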
# Example #2
    def process(self,
                document_type: Dict,
                document_field: Dict,
                config_csv_file: Dict,
                drop_previous_field_detectors: bool,
                update_field_choice_values: bool,
                **kwargs):
        try:
            self.log_info('Going to configure simple field detection config...')
            document_type = DocumentType.objects.get(pk=document_type['pk'])
            document_field = DocumentField.objects.get(pk=document_field['pk'])
            csv_bytes = DbCache.get(config_csv_file['cache_key'])
            apply_simple_config(CeleryTaskLogger(self),
                                document_field,
                                document_type,
                                csv_bytes,
                                drop_previous_field_detectors,
                                update_field_choice_values)
        finally:
            DbCache.clean_cache(config_csv_file['cache_key'])


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
app.register_task(TrainDirtyDocumentFieldDetectorModel())
app.register_task(CacheDocumentFields())
app.register_task(TrainAndTest())
app.register_task(LoadDocumentWithFields())
app.register_task(ImportSimpleFieldDetectionConfig())
            pass

    @classmethod
    def estimate_reaching_limit(cls, data) -> Tuple[int, int]:
        cluster_target = 'documents' if data.get(
            'do_cluster_documents') else 'units'
        project = data.get('project')
        proj_id = project.id if project else None
        from apps.analyze.app_vars import NOTIFY_TOO_MANY_DOCUMENTS, NOTIFY_TOO_MANY_UNITS
        if cluster_target == 'documents':
            query = Document.objects.all()
            if proj_id:
                query = query.filter(project_id=proj_id)
            count = query.count()
            count_limit = NOTIFY_TOO_MANY_DOCUMENTS.val  # e.g. 1669 documents might (but need not) be too many
        else:
            query = TextUnit.objects.all()
            if proj_id:
                query = query.filter(document__project_id=proj_id)
            count = query.count()
            count_limit = NOTIFY_TOO_MANY_UNITS.val  # e.g. 2,753,672 units is definitely too many

        return count, count_limit


app.register_task(BuildFeatureVectorsTask())
app.register_task(TrainDoc2VecModel())
app.register_task(TrainClassifier())
app.register_task(RunClassifier())
app.register_task(Cluster())
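# A minimal usage sketch (assumed, not from the source) built around estimate_reaching_limit():
# a caller compares the current count against the configured limit before clustering.
def should_warn(count: int, count_limit: int) -> bool:
    # Warn once the number of documents/units to cluster reaches the configured limit.
    return count_limit is not None and count >= count_limit

assert should_warn(2_753_672, 1_000_000)
assert not should_warn(500, 1_669)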
                            not_sources.append(not_src)
                        try:
                            notifications = NotificationRenderer.render_notification_pack(
                                [m.package_id for m in msg_pack],
                                sub, not_sources)
                            notifications_to_send += notifications
                        except Exception as e:
                            log.error('Error in send_notifications_packet() '
                                      'while calling render_notification_pack()', exc_info=e)

        log.info(f'notification.send({len(notifications_to_send)})')
        for notification in notifications_to_send:
            notification.send(log=log)


app.register_task(SendDigest())


def format_values_difference(field_code: str, old_value, new_value) -> str:
    tp = old_value.__class__.__name__ if old_value is not None \
        else new_value.__class__.__name__ if new_value is not None \
        else 'None'
    return '%s (%s): [%s], [%s]' % (field_code, tp, str(old_value), str(new_value))


def values_look_equal(a, b) -> bool:
    if a == b:
        return True

    if (isinstance(a, str) and not a and not b) or (isinstance(b, str) and not b and not a):
        return True
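# Hypothetical usage of format_values_difference() (illustration only; values are made up):
# it renders the field code, the value type, and the old/new values side by side.
assert format_values_difference('term', 'old', 'new') == 'term (str): [old], [new]'
assert format_values_difference('count', None, 3) == 'count (int): [None], [3]'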
                                      .format(document_type.code, field.code))
                        field.save(update_fields={'hide_until_js'})

                if len(changed_field_codes) > 0 and document_type.field_code_aliases:
                    updated_aliases = {k: changed_field_codes.get(v) or v
                                       for k, v in document_type.field_code_aliases.items()}
                    self.log_info('Updating field code aliases of document type {0}:\n{1}'
                                  .format(document_type.code, updated_aliases))
                    document_type.field_code_aliases = updated_aliases
                    document_type.save(update_fields={'field_code_aliases'})

        output = io.StringIO()
        writer = csv.writer(output)
        writer.writerow(('Document Type', 'Old Field Code', 'New Field Code'))
        for r in csv_log:
            writer.writerow(r)
        self.log_info('\n\n\n------------------\n'
                      'Changed fields csv:\n' + output.getvalue() + '\n------------------')


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
app.register_task(TrainDirtyDocumentFieldDetectorModel())
app.register_task(CacheDocumentFields())
app.register_task(TrainAndTest())
app.register_task(LoadDocumentWithFields())
app.register_task(ImportCSVFieldDetectionConfig())
app.register_task(FindBrokenDocumentFieldValues())
app.register_task(ImportDocumentType())
app.register_task(FixDocumentFieldCodes())
# Example #6
            if alt_source_path:
                document = Document.objects.get(id=document_id)
                document.alt_source_path = alt_source_path
                document.save()

    @staticmethod
    @shared_task(base=ExtendedTask,
                 bind=True,
                 soft_time_limit=6000,
                 default_retry_delay=10,
                 retry_backoff=True,
                 autoretry_for=(SoftTimeLimitExceeded, InterfaceError, OperationalError,),
                 max_retries=3,
                 priority=9)
    def notify_task_completed(task: ExtendedTask,
                              project_id: int,
                              document_ids: List[int],
                              user_id: int):
        # TODO: if needed, notify the user here, e.g. by email
        task.log_info(f'Notification sent to user id={user_id}')


app.register_task(ClusterProjectDocuments())
app.register_task(ReassignProjectClusterDocuments())
app.register_task(CleanProject())
app.register_task(CleanProjects())
app.register_task(CancelUpload())
app.register_task(LoadArchive())
app.register_task(SetAnnotationsStatus())
app.register_task(CreateSearchablePDF())
# Example #7
                    continue
                doc_ids_to_values[doc_a_pk].add(doc_b_pk)
                doc_ids_to_values[doc_b_pk].add(doc_a_pk)
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(doc_ids_to_values)))

        doc_ids_to_values = {doc_id: list(v) if v else None for doc_id, v in doc_ids_to_values.items()}
        field_repo.store_values_one_field_many_docs_no_ants(field=dst_field, doc_ids_to_values=doc_ids_to_values)

        log = CeleryTaskLogger(self)
        for doc_id in doc_ids_to_values.keys():
            try:
                doc = Document.objects.get(pk=doc_id)
                signals.fire_document_changed(log=log, document=doc, changed_by_user=None, system_fields_changed=False,
                                              generic_fields_changed=False, user_fields_changed=[dst_field.code])
            except Exception as ex:
                self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)


app.register_task(PreconfiguredDocumentSimilaritySearch())
app.register_task(Similarity())
app.register_task(SimilarityByFeatures())
app.register_task(ChunkSimilarity())
app.register_task(PartySimilarity())
UNREADY_STATE_TUPLE = tuple(UNREADY_STATES)


@shared_task(base=ExtendedTask,
             bind=True,
             soft_time_limit=600,
             default_retry_delay=10,
             retry_backoff=True,
             autoretry_for=(SoftTimeLimitExceeded, InterfaceError, OperationalError),
             max_retries=0)
def trigger_imanage_sync(_task: ExtendedTask):
    # SQL: Return 1 if there are enabled imanage configs last time processed too long ago
    # and there are no sync tasks pending.
    sql = SQLClause('''select case when (
    exists (select * from "{table_name}" where enabled = True
    and (last_sync_start is null
    or (last_sync_start + (sync_frequency_minutes::text || ' minute')::INTERVAL) <= now())
    limit 1)
    and not exists (select * from "{task_table_name}" where name = %s and status in %s)
    ) then 1 else 0 end
    '''.format(table_name=IManageConfig._meta.db_table, task_table_name=Task._meta.db_table),
                    [IManageSynchronization.name, UNREADY_STATE_TUPLE])

    with connection.cursor() as cursor:
        if fetch_int(cursor, sql):
            call_task(IManageSynchronization.name, auto=True, module_name='apps.imanage_integration.tasks')


app.register_task(IManageSynchronization())
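# Plain-Python restatement of the SQL condition above (illustration only, not project code):
# an iManage config is due for sync when it has never run, or when its last run started
# longer ago than its configured frequency.
from datetime import datetime, timedelta
from typing import Optional

def sync_is_due(last_sync_start: Optional[datetime], sync_frequency_minutes: int,
                now: datetime) -> bool:
    if last_sync_start is None:
        return True
    return last_sync_start + timedelta(minutes=sync_frequency_minutes) <= now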
# Example #9
                if doc_a_pk == doc_b_pk:
                    continue
                similarity = similarities[0, y]
                if similarity < similarity_threshold:
                    continue
                dfvs.append(DocumentFieldValue(document_id=doc_a_pk, value=doc_b_pk, field_id=dst_field.pk))
                dfvs.append(DocumentFieldValue(document_id=doc_b_pk, value=doc_a_pk, field_id=dst_field.pk))
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(dfvs)))

        del_doc_batch_size = 100
        for i in range(0, len(field_values_list), del_doc_batch_size):
            DocumentFieldValue.objects \
                .filter(field_id=dst_field.pk) \
                .filter(document_id__in={field_values[FIELD_CODE_DOC_ID] for field_values
                                         in field_values_list[i: i + del_doc_batch_size]}) \
                .delete()
        DocumentFieldValue.objects.bulk_create(dfvs)
        self.push()


app.register_task(PreconfiguredDocumentSimilaritySearch())
app.register_task(Similarity())
app.register_task(PartySimilarity())
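# Illustration (not project code) of the symmetric linking rule used above: when the
# similarity of two distinct documents clears the threshold, a link is stored in both
# directions so either document can be looked up from the other.
def make_links(doc_a_pk: int, doc_b_pk: int, similarity: float, threshold: float):
    if doc_a_pk == doc_b_pk or similarity < threshold:
        return []
    return [(doc_a_pk, doc_b_pk), (doc_b_pk, doc_a_pk)]

assert make_links(1, 2, 0.9, 0.75) == [(1, 2), (2, 1)]
assert make_links(1, 2, 0.5, 0.75) == []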
# Example #10
def init_method_stats_collectors(_celery_task):
    """
    Initiate collect_stats decorators on system start
    """
    for instance_values in MethodStatsCollectorPlugin.objects.values():
        decorate(collect_stats, **instance_values)


class ReindexDB(ExtendedTask):
    """
    Reindex DB and run VACUUM ANALYZE
    """
    name = 'Reindex DB'
    priority = 7

    def process(self, **kwargs):
        do_reindex = kwargs.get('reindex')
        do_vacuum = kwargs.get('vacuum')

        if do_reindex:
            with connection.cursor() as cursor:
                cursor.execute('REINDEX DATABASE {};'.format(
                    settings.DATABASES['default']['NAME']))

        if do_vacuum:
            with connection.cursor() as cursor:
                cursor.execute('VACUUM ANALYZE;')


app.register_task(ReindexDB())
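# Hypothetical task options for ReindexDB above (illustration only; the kwarg names
# 'reindex' and 'vacuum' come from process(), the values are assumptions):
reindex_db_kwargs = {
    'reindex': True,   # rebuild all indexes of the default database; blocks writes while running
    'vacuum': True,    # VACUUM ANALYZE reclaims dead tuples and refreshes planner statistics
}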
# Example #11
                self.log_info('Start extracting {} documents from {}'.format(
                    len(tar_file_members), local_file_path))

                for n, a_file in enumerate(tar_file_members):
                    if a_file.isdir():
                        continue
                    file_size = a_file.size
                    file_name = os.path.basename(a_file.name)
                    mime_type = self.get_mime_type(file_name)

                    self.log_info(
                        'Extract/start LoadDocument for {} of {} files: name={}, size={}, mime_type={}'.format(
                            n + 1, len(tar_file_members), file_name, file_size, mime_type))

                    with TemporaryUploadedFile(file_name, mime_type, file_size, 'utf-8') as tempfile:
                        tempfile.file = tar_file.extractfile(a_file)

                        self.upload_file(
                            file_name=file_name,
                            file_size=file_size,
                            contents=tempfile,
                            directory_path=self.directory_path)


app.register_task(ClusterProjectDocuments())
app.register_task(ReassignProjectClusterDocuments())
app.register_task(CleanProject())
app.register_task(CleanProjects())
app.register_task(CancelUpload())
app.register_task(LoadArchive())
# Example #12
    def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
        for doc in Document.objects.filter(pk__in=doc_ids):
            doc.cache_generic_values()
            doc.cache_field_values()

    def process(self, project: Project = None, **_kwargs):
        document_qs = Document.objects

        if project:
            document_qs = document_qs.filter(project__pk=project['pk'])

        doc_id_pack = set()
        for doc_id in document_qs.values_list('pk', flat=True):
            doc_id_pack.add(doc_id)
            if len(doc_id_pack) >= 10:
                self.run_sub_tasks('Cache field values for a set of documents',
                                   self.cache_document_fields_for_doc_ids,
                                   [(doc_id_pack, )])
                doc_id_pack = set()
        if len(doc_id_pack) > 0:
            self.run_sub_tasks('Cache field values for a set of documents',
                               self.cache_document_fields_for_doc_ids,
                               [(doc_id_pack, )])


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
app.register_task(TrainDirtyDocumentFieldDetectorModel())
app.register_task(CacheDocumentFields())
app.register_task(TrainDocumentField())
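# Generic restatement of the batching pattern used in process() above (illustration only):
# document ids are grouped into fixed-size packs so that each sub-task caches field values
# for a bounded set of documents.
from typing import Iterable, Iterator, Set

def chunk_ids(ids: Iterable[int], pack_size: int = 10) -> Iterator[Set[int]]:
    pack = set()
    for item in ids:
        pack.add(item)
        if len(pack) >= pack_size:
            yield pack
            pack = set()
    if pack:
        yield pack

assert [len(p) for p in chunk_ids(range(25), 10)] == [10, 10, 5]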
        classifier_model.set_trained_model_obj(model)
        classifier_model.save()
        parent_task.log_info(
            'Finished training model for document_type #{0} and field #{1}. '
            'Total number of samples: {2}'.format(document_type_uid, field_uid,
                                                  total_samples))

        if trigger_re_detecting_field_values:

            detect_field_values_for_document_args = []

            document_ids = Document.objects.filter(
                document_type=document_type).values_list('pk', flat=True)
            parent_task.update_subtasks_total(len(document_ids))

            for document_id in document_ids:
                detect_field_values_for_document_args.append(
                    (document_id, False, field_uid))
            parent_task.run_sub_tasks(
                'Detect Values of Field {0} for Each Document'.format(
                    field.code),
                DetectFieldValues.detect_field_values_for_document,
                detect_field_values_for_document_args)
        else:
            parent_task.force_complete()


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
MODULE_NAME = __name__


class SyncDocTermUsageModel(BaseTask):
    name = 'Synchronize Document Term Usage Summary'
    priority = 1

    SQL_INSERT = '''
    INSERT INTO "extract_documenttermusage" ("document_id", "term_id", "count")
    SELECT "document_textunit"."document_id", "extract_termusage"."term_id", 
        SUM("extract_termusage"."count") AS "count" 
    FROM "extract_termusage" 
    INNER JOIN "document_textunit" ON ("extract_termusage"."text_unit_id" = "document_textunit"."id") 
    GROUP BY "document_textunit"."document_id", "extract_termusage"."term_id" ORDER BY "count" DESC;
    '''

    SQL_CLEAR = 'DELETE FROM "extract_documenttermusage";'

    def process(self, **kwargs):
        self.log_info('Clearing Document Term Usage ...')
        with connection.cursor() as cursor:
            cursor.execute(self.SQL_CLEAR)

        self.log_info('Updating Document Term Usage ...')
        with connection.cursor() as cursor:
            cursor.execute(self.SQL_INSERT)


app.register_task(SyncDocTermUsageModel())
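# Illustration (not project code) of the aggregation the SQL_INSERT statement above
# performs: per-text-unit term counts are summed up to the (document, term) level.
from collections import defaultdict
from typing import Dict, Iterable, Tuple

def summarize_term_usage(rows: Iterable[Tuple[int, int, int]]) -> Dict[Tuple[int, int], int]:
    # rows: (document_id, term_id, count) taken from each text unit of a document
    totals = defaultdict(int)
    for document_id, term_id, count in rows:
        totals[(document_id, term_id)] += count
    return dict(totals)

assert summarize_term_usage([(1, 7, 2), (1, 7, 3), (2, 7, 1)]) == {(1, 7): 5, (2, 7): 1}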
        # prohibited use
        doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions(
            fields.get('prohibited_use__list'))
        renew_duration_tuple = fields.get('renew_non_renew_notice')
        if renew_duration_tuple:
            doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2])

        auto_renew = fields.get('auto_renew')
        if auto_renew is not None:
            doc.auto_renew = auto_renew

        area_square_feet_list = fields.get('area_square_feet__list')
        if area_square_feet_list:
            doc.area_size_sq_ft = area_square_feet_list[0]

        doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions(
            fields.get('alterations_allowed__list'))

        security_deposit = fields.get('security_deposit__set')
        if security_deposit:
            doc.security_deposit = max(security_deposit)

        doc.rent_due_frequency = fields.get('rent_due_frequency')

        mean_rent_per_month = fields.get('mean_rent_per_month__set')
        if mean_rent_per_month:
            doc.mean_rent_per_month = max(mean_rent_per_month)


app.register_task(ProcessLeaseDocuments())
# Example #16
        # store data about cleanup in ProjectCleanup Task
        task_model = self.task
        task_model.metadata = {
            'task_name': 'clean-project',
            # the key is prefixed with "_" to avoid detecting this task as a project task
            '_project_id': project_id,
        }
        task_model.save()


@app.task(name='advanced_celery.track_session_completed', bind=True)
def track_session_completed(*args, **kwargs):
    """
    Filter sessions where users were notified that upload job started
    i.e. a user set "send email notifications" flag,
    filter sessions where users were not notified that a session job is completed and
    check that upload job is completed,
    send notification email.
    """
    TaskUtils.prepare_task_execution()

    for session in UploadSession.objects.filter(
            notified_upload_started=True, notified_upload_completed=False):
        if session.is_completed():
            session.notify_upload_completed()


app.register_task(ClusterProjectDocuments())
app.register_task(ReassignProjectClusterDocuments())
app.register_task(CleanProject())
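# Plain restatement (illustration only) of the filter used in track_session_completed():
# a completion email goes out only for sessions that announced their start, have not yet
# announced completion, and are in fact finished.
def needs_completion_notice(notified_upload_started: bool,
                            notified_upload_completed: bool,
                            is_completed: bool) -> bool:
    return notified_upload_started and not notified_upload_completed and is_completed

assert needs_completion_notice(True, False, True)
assert not needs_completion_notice(True, True, True)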
                        termination_found = True
                    elif i["type"] == "benefits":
                        benefits_found = True
                    elif i["type"] == "severance":
                        severance_found = True
                Provision.objects.get_or_create(
                    text_unit=TextUnit.objects.get(pk=i["text_unit"]),
                    similarity=i["similarity"],
                    employee=employee,
                    document=Document.objects.get(pk=document_id),
                    type=i["type"])
            employee.has_noncompete = noncompete_found
            employee.has_termination = termination_found
            employee.has_benefits = benefits_found
            employee.has_severance = severance_found
            employee.save()

        # create Employer
        if employee and employee_dict.get('employer') is not None:
            employer, er_created = Employer.objects.get_or_create(
                name=employee_dict['employer'])

        if employee and employer and not employee.employer:
            employee.employer = employer
            employee.save()


app.register_task(LocateEmployees())
# Example #18
        source = kwargs.get('source')

        self.log_info('Going to train doc2vec model from {} objects...'.format(
            source.upper()))

        transformer_name = kwargs.get('transformer_name')
        project_ids = kwargs.get('project_ids')

        vector_size = kwargs.get('vector_size')
        window = kwargs.get('window')
        min_count = kwargs.get('min_count')
        dm = kwargs.get('dm')

        transformer = Doc2VecTransformer(vector_size=vector_size,
                                         window=window,
                                         min_count=min_count,
                                         dm=dm)

        model_builder_args = dict(project_ids=project_ids,
                                  transformer_name=transformer_name)
        if source == 'document':
            model_builder = transformer.build_doc2vec_document_model
        else:
            model_builder = transformer.build_doc2vec_text_unit_model
            model_builder_args['text_unit_type'] = kwargs.get('text_unit_type')

        model_builder(**model_builder_args)


app.register_task(TrainDoc2VecModel())
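# Hypothetical kwargs for the TrainDoc2VecModel task above (parameter names are taken
# from the code; the values are illustrative assumptions, not defaults from the source):
train_doc2vec_kwargs = {
    'source': 'document',              # or 'text_unit'
    'transformer_name': 'doc2vec_documents_v1',
    'project_ids': [1, 2],
    'vector_size': 100,                # dimensionality of the learned vectors
    'window': 5,                       # context window size
    'min_count': 10,                   # ignore tokens rarer than this
    'dm': 1,                           # 1 = distributed memory, 0 = distributed bag of words
    'text_unit_type': 'sentence',      # only used when source is not 'document'
}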