def audit_phase_2(audit, kind=None):
    """Run the second processing phase over the documents of one audit.

    Collects the ids of the audit's documents that are in the
    ``Preprocessed`` or ``Error`` state (optionally restricted to ``kind``),
    appends the ids listed under ``audit["charters"]``, and calls
    ``processor.process`` on every document that both ``need_analysis`` and
    ``isPreprocessed()`` accept. Afterwards the audit status is changed to
    ``"Finalizing"``.

    Args:
        audit: mapping that provides ``"_id"`` and ``"subsidiary" -> "name"``;
            an optional ``"charters"`` entry holds extra document ids.
        kind: passed through to ``get_docs_by_audit_id`` as its ``kind``
            filter.
    """
    ctx = AuditContext(audit["subsidiary"]["name"])

    # Emitted through the module logger (not print) so this message obeys the
    # logging configuration like every other message in this function.
    logger.info(f'.....processing audit {audit["_id"]}')

    document_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=kind,
        id_only=True)
    document_ids.extend(audit.get("charters", []))

    for k, document_id in enumerate(document_ids):
        jdoc = DbJsonDoc(finalizer.get_doc_by_id(document_id))

        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}'
            )
            continue

        if need_analysis(jdoc) and jdoc.isPreprocessed():
            logger.info(
                f'.....processing {k} of {len(document_ids)} {jdoc.documentType} {document_id}'
            )
            processor.process(jdoc, audit, ctx)

    change_audit_status(audit, "Finalizing")  # TODO: check ALL docs in proper state
def audit_phase_1(audit, kind=None):
    """Run the first (pre-processing) phase over the documents of one audit.

    Gathers the ids of the audit's documents in the ``New`` state (optionally
    restricted to ``kind``) plus the ids listed under ``audit["charters"]``,
    then calls ``processor.preprocess`` on each document accepted by both
    ``need_analysis`` and ``isNew()``. Documents whose type has no registered
    processor are logged and skipped.

    Args:
        audit: mapping that provides ``"_id"`` and ``"subsidiary" -> "name"``;
            an optional ``"charters"`` entry holds extra document ids.
        kind: passed through to ``get_docs_by_audit_id`` as its ``kind``
            filter.
    """
    logger.info(f'.....processing audit {audit["_id"]}')
    ctx = AuditContext(audit["subsidiary"]["name"])

    document_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.New.value],
        kind=kind,
        id_only=True,
    )
    charter_ids = audit.get("charters", [])
    document_ids.extend(charter_ids)

    for k, document_id in enumerate(document_ids):
        db_record = finalizer.get_doc_by_id(document_id)
        jdoc = DbJsonDoc(db_record)

        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}'
            )
            continue

        # Progress is logged for every supported document, even when the
        # checks below decide that it is not pre-processed.
        logger.info(
            f'......pre-processing {k} of {len(document_ids)} {jdoc.documentType}:{document_id}'
        )
        if not need_analysis(jdoc):
            continue
        if jdoc.isNew():
            processor.preprocess(jdoc=jdoc, context=ctx)
def embedd_large(self, text_map, max_tokens=6000, log_addon=''):
    """Embed a long token sequence window by window and join the results.

    ``text_map`` is cut into consecutive windows of ``max_tokens`` tokens.
    Each window is passed to ``self.embedd_tokens`` together with up to
    ``max_tokens // 20`` of the tokens that follow it, and only the first
    ``max_tokens`` entries of the result are kept, so the overlap is used
    as trailing context only.

    Args:
        text_map: sliceable token sequence supporting ``len()``.
        max_tokens: window size in tokens; must be positive.
        log_addon: prefix added to every log message.

    Returns:
        The single window's embeddings unchanged when one window suffices,
        the ``np.concatenate`` of all windows' embeddings otherwise, or
        ``None`` when ``text_map`` is empty.

    Raises:
        ValueError: if ``max_tokens`` is not positive (the window would
            never advance and the loop would not terminate).
    """
    if max_tokens <= 0:
        raise ValueError(f'max_tokens must be positive, got {max_tokens}')

    total = len(text_map)
    elmo_logger.info(
        f'{log_addon} {len(text_map)} max_tokens={max_tokens}')

    window = max_tokens
    overlap = max_tokens // 20
    # Ceiling division: the number of iterations the loop below performs.
    number_of_windows = (total + window - 1) // window

    msg = f"{log_addon} Document is too large for embedding: {len(text_map)} tokens. Splitting into {number_of_windows} windows overlapping with {overlap} tokens "
    elmo_logger.warning(msg)

    # Collect per-window results and concatenate once at the end, instead of
    # re-copying the growing array on every iteration.
    chunks = []
    for start in range(0, total, window):
        subtokens = text_map[start:start + window + overlap]
        elmo_logger.debug(
            f"{log_addon} Embedding region: {start}, {len(subtokens)}")
        # assumes embedd_tokens yields one entry per token along the first
        # axis - TODO confirm
        chunks.append(self.embedd_tokens(subtokens)[0:window])

    if not chunks:
        return None
    if len(chunks) == 1:
        # Return the single window's result as is (no copy, no conversion).
        return chunks[0]
    return np.concatenate(chunks)
def test_get_docs_by_audit_id(self):
    """Print id and filename of each PROTOCOL document of the first audit.

    Logs a warning and returns early when ``get_audits()`` yields nothing.
    """
    all_audits = get_audits()
    if len(all_audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = all_audits[0]['_id']
    protocols = get_docs_by_audit_id(first_audit_id, kind='PROTOCOL')
    for protocol in protocols:
        print(protocol['_id'], protocol['filename'])
def test_process_charters_phase_1(self):
    """Pre-process every CHARTER document of the first audit.

    Each document is wrapped in ``DbJsonDoc`` and handed to the processor
    registered under ``'CHARTER'`` with a fresh ``AuditContext()``. Logs a
    warning and returns early when there are no audits.
    """
    all_audits = get_audits()
    if len(all_audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = all_audits[0]['_id']
    charters = get_docs_by_audit_id(first_audit_id, kind='CHARTER')
    charter_processor = document_processors.get('CHARTER')

    for raw_charter in charters:
        jdoc = DbJsonDoc(raw_charter)
        charter_processor.preprocess(jdoc, AuditContext())
def test_process_contracts_phase_1(self):
    """Pre-process every CONTRACT document of the first audit.

    Each document is wrapped in ``DbJsonDoc`` and handed to the processor
    registered under ``'CONTRACT'`` with a fresh ``AuditContext()``. Logs a
    warning and returns early when there are no audits.
    """
    all_audits = get_audits()
    if len(all_audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = all_audits[0]['_id']
    contracts = get_docs_by_audit_id(first_audit_id, kind='CONTRACT')
    contract_processor = document_processors.get('CONTRACT')

    for raw_contract in contracts:
        jdoc = DbJsonDoc(raw_contract)
        contract_processor.preprocess(jdoc, AuditContext())
def test_get_org_names(self):
    """Run ``find_org_date_number`` on the first audit's charters and print tags.

    For every document returned for the first audit, prints its filename,
    builds a charter document from its ``'parse'`` entry via
    ``join_paragraphs``, runs ``CharterParser.find_org_date_number`` on it
    and prints each tag of the result. Logs a warning and returns early when
    there are no audits.
    """
    charter_parser = CharterParser()

    all_audits = get_audits()
    if len(all_audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = all_audits[0]['_id']
    # NOTE(review): CHARTER is a bare name here, presumably a constant
    # defined or imported elsewhere in the file - confirm.
    charter_records = get_docs_by_audit_id(first_audit_id, kind=CHARTER)

    for db_document in charter_records:
        print(db_document['filename'])

        parse_json = db_document['parse']
        charter: CharterDocument = join_paragraphs(
            parse_json, doc_id=db_document['_id'])

        # TODO: mind, this could be slow if embedding is required
        charter_parser.find_org_date_number(charter, AuditContext())

        for tag in charter.get_tags():
            print(tag)