for span in sentence_spans:
    sentence = text[span[0]:span[1]]
    annotated_fields = set()
    added = False
    if annotations:
        for a in annotations:
            if a.document_field \
                    and a.start_offset <= span[1] and span[0] <= a.end_offset:
                field_code = a.document_field.pk
                add(field_code, sentence)
                annotated_fields.add(field_code)
                added = True
    for field_config in field_configs.values():
        if field_config.field_code not in annotated_fields \
                and field_config.sentence_matches_field_detectors(sentence):
            add(field_config.field_code, sentence)
            added = True
    if not added:
        add('', sentence)

task.log_info('Processed {0} sentences of document {1}'.format(
    len(sentence_spans), doc.pk))

if retrain_model:
    TrainFieldDetectorModel.train_model_for_document_class.apply_async(
        args=(document_class_name,))


app.register_task(BuildFieldDetectorDataset())
app.register_task(TrainFieldDetectorModel())
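# The annotation check above is the standard closed-interval overlap test:
# an annotation overlaps a sentence span iff each one starts before the other
# ends. A self-contained illustration (function and names here are hypothetical):

def overlaps(ann_start: int, ann_end: int, span_start: int, span_end: int) -> bool:
    return ann_start <= span_end and span_start <= ann_end

assert overlaps(5, 10, 8, 20)       # partial overlap
assert overlaps(5, 10, 0, 5)        # touching at a boundary still counts
assert not overlaps(5, 10, 11, 20)  # disjoint spans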
    def process(self,
                document_type: Dict,
                document_field: Dict,
                config_csv_file: Dict,
                drop_previous_field_detectors: bool,
                update_field_choice_values: bool,
                **kwargs):
        try:
            self.log_info('Going to configure simple field detection config...')
            document_type = DocumentType.objects.get(pk=document_type['pk'])
            document_field = DocumentField.objects.get(pk=document_field['pk'])
            csv_bytes = DbCache.get(config_csv_file['cache_key'])
            apply_simple_config(CeleryTaskLogger(self),
                                document_field,
                                document_type,
                                csv_bytes,
                                drop_previous_field_detectors,
                                update_field_choice_values)
        finally:
            # Drop the cached CSV even if applying the config fails.
            DbCache.clean_cache(config_csv_file['cache_key'])


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
app.register_task(TrainDirtyDocumentFieldDetectorModel())
app.register_task(CacheDocumentFields())
app.register_task(TrainAndTest())
app.register_task(LoadDocumentWithFields())
app.register_task(ImportSimpleFieldDetectionConfig())
        pass

    @classmethod
    def estimate_reaching_limit(cls, data) -> Tuple[int, int]:
        cluster_target = 'documents' if data.get('do_cluster_documents') else 'units'
        project = data.get('project')
        proj_id = project.id if project else None
        from apps.analyze.app_vars import NOTIFY_TOO_MANY_DOCUMENTS, NOTIFY_TOO_MANY_UNITS
        if cluster_target == 'documents':
            query = Document.objects.all()
            if proj_id:
                query = query.filter(project_id=proj_id)
            count = query.count()
            count_limit = NOTIFY_TOO_MANY_DOCUMENTS.val  # 1669 might be (not necessarily) too much
        else:
            query = TextUnit.objects.all()
            if proj_id:
                query = query.filter(document__project_id=proj_id)
            count = query.count()
            count_limit = NOTIFY_TOO_MANY_UNITS.val  # 2,753,672 is definitely too much
        return count, count_limit


app.register_task(BuildFeatureVectorsTask())
app.register_task(TrainDoc2VecModel())
app.register_task(TrainClassifier())
app.register_task(RunClassifier())
app.register_task(Cluster())
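# A sketch of how a caller might consume estimate_reaching_limit(), assuming
# the classmethod belongs to the Cluster task registered above (the surrounding
# notification logic is not shown in this fragment):
count, count_limit = Cluster.estimate_reaching_limit({'do_cluster_documents': True,
                                                      'project': None})
if count > count_limit:
    print(f'Clustering {count} documents exceeds the soft limit of {count_limit}')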
    not_sources.append(not_src)
    try:
        notifications = NotificationRenderer.render_notification_pack(
            [m.package_id for m in msg_pack], sub, not_sources)
        notifications_to_send += notifications
    except Exception as e:
        log.error('Error in send_notifications_packet() '
                  'while calling render_notification_pack()', exc_info=e)

log.info(f'notification.send({len(notifications_to_send)})')
for notification in notifications_to_send:
    notification.send(log=log)


app.register_task(SendDigest())


def format_values_difference(field_code: str, old_value, new_value) -> str:
    tp = old_value.__class__.__name__ if old_value is not None \
        else new_value.__class__.__name__ if new_value is not None \
        else 'None'
    return '%s (%s): [%s], [%s]' % (field_code, tp, str(old_value), str(new_value))


def values_look_equal(a, b) -> bool:
    if a == b:
        return True
    if (isinstance(a, str) and not a and not b) or (isinstance(b, str) and not b and not a):
        return True
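# Expected behavior of the two helpers above (the values_look_equal fragment
# is truncated here, but the branches shown imply these results):
print(format_values_difference('rent_due_date', None, '2020-01-01'))
# -> rent_due_date (str): [None], [2020-01-01]
print(values_look_equal('', None))  # -> True: empty string and None compare as equal
print(values_look_equal(1, 1))      # -> True: plain equality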
                          .format(document_type.code, field.code))
            field.save(update_fields={'hide_until_js'})

        if len(changed_field_codes) > 0 and document_type.field_code_aliases:
            updated_aliases = {k: changed_field_codes.get(v) or v
                               for k, v in document_type.field_code_aliases.items()}
            self.log_info('Updating field code aliases of document type {0}:\n{1}'
                          .format(document_type.code, updated_aliases))
            document_type.field_code_aliases = updated_aliases
            document_type.save(update_fields={'field_code_aliases'})

        output = io.StringIO()
        writer = csv.writer(output)
        writer.writerow(('Document Type', 'Old Field Code', 'New Field Code'))
        for r in csv_log:
            writer.writerow(r)
        self.log_info('\n\n\n------------------\n'
                      'Changed fields csv:\n' + output.getvalue() +
                      '\n------------------')


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
app.register_task(TrainDirtyDocumentFieldDetectorModel())
app.register_task(CacheDocumentFields())
app.register_task(TrainAndTest())
app.register_task(LoadDocumentWithFields())
app.register_task(ImportCSVFieldDetectionConfig())
app.register_task(FindBrokenDocumentFieldValues())
app.register_task(ImportDocumentType())
app.register_task(FixDocumentFieldCodes())
        if alt_source_path:
            document = Document.objects.get(id=document_id)
            document.alt_source_path = alt_source_path
            document.save()

    @staticmethod
    @shared_task(base=ExtendedTask,
                 bind=True,
                 soft_time_limit=6000,
                 default_retry_delay=10,
                 retry_backoff=True,
                 autoretry_for=(SoftTimeLimitExceeded, InterfaceError, OperationalError),
                 max_retries=3,
                 priority=9)
    def notify_task_completed(task: ExtendedTask, project_id: int, document_ids: List[int], user_id: int):
        # TODO: if needed - e.g. email the user
        task.log_info(f'Notification sent to user id={user_id}')


app.register_task(ClusterProjectDocuments())
app.register_task(ReassignProjectClusterDocuments())
app.register_task(CleanProject())
app.register_task(CleanProjects())
app.register_task(CancelUpload())
app.register_task(LoadArchive())
app.register_task(SetAnnotationsStatus())
app.register_task(CreateSearchablePDF())
                continue
            doc_ids_to_values[doc_a_pk].add(doc_b_pk)
            doc_ids_to_values[doc_b_pk].add(doc_a_pk)
        if x % 100 == 0:
            self.log_info('{field}: Checked for similarity {x} documents of {n}'
                          .format(field=dst_field.code, x=x + 1, n=total_docs))
            self.push()
    self.push()

    self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                  .format(field=dst_field.code, n=len(doc_ids_to_values)))

    # .items() is required here: iterating the dict directly would yield only keys.
    doc_ids_to_values = {doc_id: list(v) if v else None
                         for doc_id, v in doc_ids_to_values.items()}
    field_repo.store_values_one_field_many_docs_no_ants(field=dst_field,
                                                        doc_ids_to_values=doc_ids_to_values)

    log = CeleryTaskLogger(self)
    for doc_id in doc_ids_to_values.keys():
        try:
            doc = Document.objects.get(pk=doc_id)
            signals.fire_document_changed(log=log,
                                          document=doc,
                                          changed_by_user=None,
                                          system_fields_changed=False,
                                          generic_fields_changed=False,
                                          user_fields_changed=[dst_field.code])
        except Exception as ex:
            self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)


app.register_task(PreconfiguredDocumentSimilaritySearch())
app.register_task(Similarity())
app.register_task(SimilarityByFeatures())
app.register_task(ChunkSimilarity())
app.register_task(PartySimilarity())
UNREADY_STATE_TUPLE = tuple(UNREADY_STATES)


@shared_task(base=ExtendedTask,
             bind=True,
             soft_time_limit=600,
             default_retry_delay=10,
             retry_backoff=True,
             autoretry_for=(SoftTimeLimitExceeded, InterfaceError, OperationalError),
             max_retries=0)
def trigger_imanage_sync(_task: ExtendedTask):
    # SQL: return 1 if there is an enabled iManage config whose last sync started
    # too long ago (or never) and no sync task is currently pending.
    sql = SQLClause('''select case when (
        exists (select * from "{table_name}"
                where enabled = True
                  and (last_sync_start is null
                       or (last_sync_start + (sync_frequency_minutes::text||' minute')::INTERVAL) <= now())
                limit 1)
        and not exists (select * from "{task_table_name}"
                        where name = %s and status in %s)
        ) then 1 else 0 end'''.format(table_name=IManageConfig._meta.db_table,
                                      task_table_name=Task._meta.db_table),
                    [IManageSynchronization.name, UNREADY_STATE_TUPLE])
    with connection.cursor() as cursor:
        if fetch_int(cursor, sql):
            call_task(IManageSynchronization.name, auto=True, module_name='apps.imanage_integration.tasks')


app.register_task(IManageSynchronization())
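# The "due for sync" predicate from the SQL above, restated in plain Python
# (a hypothetical helper for illustration, not part of the integration code):
from datetime import datetime, timedelta
from typing import Optional

def sync_is_due(last_sync_start: Optional[datetime], sync_frequency_minutes: int) -> bool:
    if last_sync_start is None:
        return True  # never synced yet
    return last_sync_start + timedelta(minutes=sync_frequency_minutes) <= datetime.now()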
            if doc_a_pk == doc_b_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            dfvs.append(DocumentFieldValue(document_id=doc_a_pk, value=doc_b_pk, field_id=dst_field.pk))
            dfvs.append(DocumentFieldValue(document_id=doc_b_pk, value=doc_a_pk, field_id=dst_field.pk))
        if x % 100 == 0:
            self.log_info('{field}: Checked for similarity {x} documents of {n}'
                          .format(field=dst_field.code, x=x + 1, n=total_docs))
            self.push()
    self.push()

    self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                  .format(field=dst_field.code, n=len(dfvs)))

    # Delete stale values in fixed-size batches to keep the IN clause bounded.
    del_doc_batch_size = 100
    for i in range(0, len(field_values_list), del_doc_batch_size):
        DocumentFieldValue.objects \
            .filter(field_id=dst_field.pk) \
            .filter(document_id__in={field_values[FIELD_CODE_DOC_ID]
                                     for field_values in field_values_list[i: i + del_doc_batch_size]}) \
            .delete()
    DocumentFieldValue.objects.bulk_create(dfvs)
    self.push()


app.register_task(PreconfiguredDocumentSimilaritySearch())
app.register_task(Similarity())
app.register_task(PartySimilarity())
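# The batching idiom used by the delete loop above, shown in isolation on
# hypothetical toy data:
items = list(range(10))
batch_size = 3
batches = [items[i: i + batch_size] for i in range(0, len(items), batch_size)]
assert batches == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]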
def init_method_stats_collectors(_celery_task):
    """
    Initiate collect_stats decorators on system start
    """
    for instance_values in MethodStatsCollectorPlugin.objects.values():
        decorate(collect_stats, **instance_values)


class ReindexDB(ExtendedTask):
    """
    Reindex DB and run VACUUM ANALYZE
    """
    name = 'Reindex DB'
    priority = 7

    def process(self, **kwargs):
        do_reindex = kwargs.get('reindex')
        do_vacuum = kwargs.get('vacuum')

        if do_reindex:
            with connection.cursor() as cursor:
                cursor.execute('REINDEX DATABASE {};'.format(
                    settings.DATABASES['default']['NAME']))
        if do_vacuum:
            with connection.cursor() as cursor:
                cursor.execute('VACUUM ANALYZE;')


app.register_task(ReindexDB())
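# Hypothetical invocation of ReindexDB through the task framework, mirroring
# the call_task(...) usage seen in trigger_imanage_sync above; the module_name
# value is an assumption, while the kwargs match what process() reads:
call_task(ReindexDB.name, module_name='apps.task.tasks', reindex=True, vacuum=True)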
        self.log_info('Start extracting {} documents from {}'.format(
            len(tar_file_members), local_file_path))

        for n, a_file in enumerate(tar_file_members):
            if a_file.isdir():
                continue
            file_size = a_file.size
            file_name = os.path.basename(a_file.name)
            mime_type = self.get_mime_type(file_name)

            self.log_info(
                'Extract/start LoadDocument for {} of {} files: name={}, size={}, mime_type={}'.format(
                    n + 1, len(tar_file_members), file_name, file_size, mime_type))

            with TemporaryUploadedFile(file_name, mime_type, file_size, 'utf-8') as tempfile:
                tempfile.file = tar_file.extractfile(a_file)
                self.upload_file(
                    file_name=file_name,
                    file_size=file_size,
                    contents=tempfile,
                    directory_path=self.directory_path)


app.register_task(ClusterProjectDocuments())
app.register_task(ReassignProjectClusterDocuments())
app.register_task(CleanProject())
app.register_task(CleanProjects())
app.register_task(CancelUpload())
app.register_task(LoadArchive())
    def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
        for doc in Document.objects.filter(pk__in=doc_ids):
            doc.cache_generic_values()
            doc.cache_field_values()

    def process(self, project: Project = None, **_kwargs):
        # NOTE: the task framework apparently passes model references as dicts,
        # hence project['pk'] below despite the Project type hint.
        document_qs = Document.objects
        if project:
            document_qs = document_qs.filter(project__pk=project['pk'])

        # Fan out caching work in packs of 10 documents per sub-task.
        doc_id_pack = set()
        for doc_id in document_qs.values_list('pk', flat=True):
            doc_id_pack.add(doc_id)
            if len(doc_id_pack) >= 10:
                self.run_sub_tasks('Cache field values for a set of documents',
                                   self.cache_document_fields_for_doc_ids,
                                   [(doc_id_pack,)])
                doc_id_pack = set()
        if len(doc_id_pack) > 0:
            self.run_sub_tasks('Cache field values for a set of documents',
                               self.cache_document_fields_for_doc_ids,
                               [(doc_id_pack,)])


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
app.register_task(TrainDirtyDocumentFieldDetectorModel())
app.register_task(CacheDocumentFields())
app.register_task(TrainDocumentField())
    classifier_model.set_trained_model_obj(model)
    classifier_model.save()
    parent_task.log_info(
        'Finished training model for document_type #{0} and field #{1}. '
        'Total number of samples: {2}'.format(document_type_uid, field_uid, total_samples))

    if trigger_re_detecting_field_values:
        detect_field_values_for_document_args = []
        document_ids = Document.objects.filter(
            document_type=document_type).values_list('pk', flat=True)
        parent_task.update_subtasks_total(len(document_ids))
        for document_id in document_ids:
            detect_field_values_for_document_args.append((document_id, False, field_uid))
        parent_task.run_sub_tasks(
            'Detect Values of Field {0} for Each Document'.format(field.code),
            DetectFieldValues.detect_field_values_for_document,
            detect_field_values_for_document_args)
    else:
        parent_task.force_complete()


app.register_task(DetectFieldValues())
app.register_task(TrainDocumentFieldDetectorModel())
MODULE_NAME = __name__


class SyncDocTermUsageModel(BaseTask):
    name = 'Synchronize Document Term Usage Summary'
    priority = 1

    SQL_INSERT = '''
INSERT INTO "extract_documenttermusage" ("document_id", "term_id", "count")
SELECT "document_textunit"."document_id",
       "extract_termusage"."term_id",
       SUM("extract_termusage"."count") AS "count"
FROM "extract_termusage"
INNER JOIN "document_textunit"
    ON ("extract_termusage"."text_unit_id" = "document_textunit"."id")
GROUP BY "document_textunit"."document_id", "extract_termusage"."term_id"
ORDER BY "count" DESC;
'''

    SQL_CLEAR = 'DELETE FROM "extract_documenttermusage";'

    def process(self, **kwargs):
        self.log_info('Clearing Document Term Usage ...')
        with connection.cursor() as cursor:
            cursor.execute(self.SQL_CLEAR)
        self.log_info('Updating Document Term Usage ...')
        with connection.cursor() as cursor:
            cursor.execute(self.SQL_INSERT)


app.register_task(SyncDocTermUsageModel())
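# A rough ORM restatement of SQL_INSERT for readers less comfortable with raw
# SQL. Model names are inferred from the table names above (Django convention:
# "extract_termusage" -> extract.TermUsage) and are assumptions, not confirmed API.
from django.db.models import Sum
from apps.extract.models import DocumentTermUsage, TermUsage  # assumed import path

rows = (TermUsage.objects
        .values('text_unit__document_id', 'term_id')
        .annotate(total=Sum('count')))
DocumentTermUsage.objects.bulk_create(
    DocumentTermUsage(document_id=r['text_unit__document_id'],
                      term_id=r['term_id'],
                      count=r['total'])
    for r in rows)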
    # prohibited use
    doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions(
        fields.get('prohibited_use__list'))

    renew_duration_tuple = fields.get('renew_non_renew_notice')
    if renew_duration_tuple:
        doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2])

    auto_renew = fields.get('auto_renew')
    if auto_renew is not None:
        doc.auto_renew = auto_renew

    area_square_feet_list = fields.get('area_square_feet__list')
    if area_square_feet_list:
        doc.area_size_sq_ft = area_square_feet_list[0]

    doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions(
        fields.get('alterations_allowed__list'))

    security_deposit = fields.get('security_deposit__set')
    if security_deposit:
        doc.security_deposit = max(security_deposit)

    doc.rent_due_frequency = fields.get('rent_due_frequency')

    mean_rent_per_month = fields.get('mean_rent_per_month__set')
    if mean_rent_per_month:
        doc.mean_rent_per_month = max(mean_rent_per_month)


app.register_task(ProcessLeaseDocuments())
        # store data about cleanup in ProjectCleanup Task
        task_model = self.task
        task_model.metadata = {
            'task_name': 'clean-project',
            '_project_id': project_id  # added "_" to avoid detecting the task as a project task
        }
        task_model.save()


@app.task(name='advanced_celery.track_session_completed', bind=True)
def track_session_completed(*args, **kwargs):
    """
    Select sessions whose users were notified that the upload job started
    (i.e. a user set the "send email notifications" flag) but were not yet
    notified that the session job completed; for each such session, check
    whether the upload job is complete and send the notification email.
    """
    TaskUtils.prepare_task_execution()

    for session in UploadSession.objects.filter(notified_upload_started=True,
                                                notified_upload_completed=False):
        if session.is_completed():
            session.notify_upload_completed()


app.register_task(ClusterProjectDocuments())
app.register_task(ReassignProjectClusterDocuments())
app.register_task(CleanProject())
                    termination_found = True
                elif i["type"] == "benefits":
                    benefits_found = True
                elif i["type"] == "severance":
                    severance_found = True
                Provision.objects.get_or_create(
                    text_unit=TextUnit.objects.get(pk=i["text_unit"]),
                    similarity=i["similarity"],
                    employee=employee,
                    document=Document.objects.get(pk=document_id),
                    type=i["type"])
            employee.has_noncompete = noncompete_found
            employee.has_termination = termination_found
            employee.has_benefits = benefits_found
            employee.has_severance = severance_found
            employee.save()

        # create Employer
        if employee and employee_dict.get('employer') is not None:
            employer, er_created = Employer.objects.get_or_create(
                name=employee_dict['employer'])
        if employee and employer and not employee.employer:
            employee.employer = employer
            employee.save()


app.register_task(LocateEmployees())
        source = kwargs.get('source')
        self.log_info('Going to train doc2vec model from {} objects...'.format(source.upper()))

        transformer_name = kwargs.get('transformer_name')
        project_ids = kwargs.get('project_ids')
        vector_size = kwargs.get('vector_size')
        window = kwargs.get('window')
        min_count = kwargs.get('min_count')
        dm = kwargs.get('dm')

        transformer = Doc2VecTransformer(vector_size=vector_size,
                                         window=window,
                                         min_count=min_count,
                                         dm=dm)
        model_builder_args = dict(project_ids=project_ids,
                                  transformer_name=transformer_name)
        if source == 'document':
            model_builder = transformer.build_doc2vec_document_model
        else:
            model_builder = transformer.build_doc2vec_text_unit_model
            model_builder_args['text_unit_type'] = kwargs.get('text_unit_type')

        model_builder(**model_builder_args)


app.register_task(TrainDoc2VecModel())
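# Doc2VecTransformer is project code, but the hyperparameters it accepts
# (vector_size, window, min_count, dm) mirror gensim's Doc2Vec. A minimal
# standalone gensim sketch, assuming (not confirmed by this fragment) that
# gensim backs the transformer:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

texts = ['first sample document', 'second sample document']
corpus = [TaggedDocument(words=t.split(), tags=[i]) for i, t in enumerate(texts)]
model = Doc2Vec(corpus, vector_size=100, window=5, min_count=1, dm=1, epochs=10)
vector = model.infer_vector('another sample'.split())  # 100-dimensional embedding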