def audit_phase_2(audit, kind=None):
    """Run the analysis pass (phase 2) over an audit's preprocessed/errored documents.

    Collects the audit's document ids (plus its linked charters), dispatches
    each to the processor registered for its document type, and finally moves
    the audit into the "Finalizing" state.
    """
    ctx = AuditContext(audit["subsidiary"]["name"])
    print(f'.....processing audit {audit["_id"]}')

    doc_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=kind,
        id_only=True)
    # charters are linked to the audit directly, not found via the doc query
    doc_ids.extend(audit.get("charters", []))

    total = len(doc_ids)
    for idx, doc_id in enumerate(doc_ids):
        json_doc = DbJsonDoc(finalizer.get_doc_by_id(doc_id))
        processor: BaseProcessor = document_processors.get(json_doc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {json_doc.documentType}, cannot process {doc_id}'
            )
            continue
        # only documents that still need analysis and finished preprocessing
        if need_analysis(json_doc) and json_doc.isPreprocessed():
            logger.info(
                f'.....processing {idx} of {total} {json_doc.documentType} {doc_id}'
            )
            processor.process(json_doc, audit, ctx)

    change_audit_status(audit, "Finalizing")  # TODO: check ALL docs in proper state
def audit_phase_1(audit, kind=None):
    """Run the preprocessing pass (phase 1) over an audit's new documents.

    Collects the audit's new document ids (plus its linked charters) and
    hands each one to the processor registered for its document type.
    """
    logger.info(f'.....processing audit {audit["_id"]}')
    ctx = AuditContext(audit["subsidiary"]["name"])

    doc_ids = get_docs_by_audit_id(audit["_id"],
                                   states=[DocumentState.New.value],
                                   kind=kind,
                                   id_only=True)
    # charters are linked to the audit directly, not found via the doc query
    doc_ids.extend(audit.get("charters", []))

    total = len(doc_ids)
    for idx, doc_id in enumerate(doc_ids):
        json_doc = DbJsonDoc(finalizer.get_doc_by_id(doc_id))
        processor: BaseProcessor = document_processors.get(json_doc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {json_doc.documentType}, cannot process {doc_id}'
            )
            continue
        logger.info(
            f'......pre-processing {idx} of {total} {json_doc.documentType}:{doc_id}'
        )
        # only documents that still need analysis and are still in the New state
        if need_analysis(json_doc) and json_doc.isNew():
            processor.preprocess(jdoc=json_doc, context=ctx)
def test_from_json(self):
    """Round-trip a document through DocumentJson JSON serialization.

    Verifies that the restored object carries the same top-level keys and the
    same attribute keys (checked in both directions) as the original.
    """
    doc, _, ctx = self._get_doc_factory_ctx()
    # hacks for old pickles: reset fields that legacy pickled documents lack
    doc.__dict__['number'] = None
    doc.__dict__['date'] = None
    doc.__dict__['warnings'] = []
    doc.__dict__['attributes_tree'] = ContractSchema()

    ctx.find_attributes(doc, AuditContext())

    json_struct = DocumentJson(doc)
    json_string = json.dumps(json_struct.__dict__,
                             indent=4,
                             ensure_ascii=False,
                             default=json_util.default)
    restored: DocumentJson = DocumentJson.from_json_str(json_string)

    for key in restored.__dict__:
        print(key)
        self.assertIn(key, json_struct.__dict__.keys())

    # attribute key sets must match in both directions
    for key in restored.attributes:
        self.assertIn(key, json_struct.attributes.keys())
    for key in json_struct.attributes:
        self.assertIn(key, restored.attributes.keys())
def nn_find_org_names(textmap: TextMap, semantic_map: DataFrame,
                      audit_ctx: AuditContext) -> [ContractAgent]:
    """Extract the two contracting agents (org-1, org-2) from the semantic map.

    For each of the two org slots, reads the name/alias/type tags, normalizes
    the agent, then orders the pair: when the audit has a known subsidiary,
    that organization is sorted first; otherwise agents are ordered
    alphabetically by name. `check_org_intersections` then mutates the agents
    to resolve overlapping tags.

    :param textmap: token map of the document text
    :param semantic_map: per-token tag scores produced by the NN model
    :param audit_ctx: audit context; may carry a known subsidiary name
    :return: list of two ContractAgent instances, ordered as described above
    """
    contract_agents: [ContractAgent] = []
    for o in [1, 2]:
        ca: ContractAgent = ContractAgent()
        for n in ['name', 'alias', 'type']:
            tagname = f'org-{o}-{n}'
            tag = nn_get_tag_value(tagname, textmap, semantic_map)
            setattr(ca, n, tag)
        normalize_contract_agent(ca)
        contract_agents.append(ca)

    def _name_val_safe(a):
        # tolerate agents whose name tag was not detected
        if a.name is not None:
            return a.name.value
        return ''

    if audit_ctx.audit_subsidiary_name:
        # known subsidiary goes first (False sorts before True)
        contract_agents = sorted(
            contract_agents,
            key=lambda a: not audit_ctx.is_same_org(_name_val_safe(a)))
    else:
        # idiom fix: pass the key function directly instead of wrapping it in a lambda
        contract_agents = sorted(contract_agents, key=_name_val_safe)

    check_org_intersections(contract_agents)  # mutator

    return contract_agents  # _swap_org_tags(cas)
def test_charter_parser(self):
    """Smoke-test CharterParser end-to-end on a tiny synthetic charter."""
    # from renderer import SilentRenderer
    fake_point = [1, 6, 4]
    emb = FakeEmbedder(fake_point)

    parsed_legal_doc = LegalDocument("1. ЮРИДИЧЕСКИЙ содержание 4.").parse()
    charter = CharterDocument(parsed_legal_doc).parse()

    parser = CharterParser(emb, emb)
    actx = AuditContext()
    parser.find_org_date_number(charter, actx)
    parser.find_attributes(charter, actx)
    print(charter.warnings)
def test_analyze_contract_0(self):
    """Smoke-test ContractParser end-to-end on a tiny synthetic contract."""
    emb = FakeEmbedder(np.random.rand(1024))
    parser = ContractParser(emb)

    contract = ContractDocument("1. ЮРИДИЧЕСКИЙ содержание 4.")
    contract.parse()

    actx = AuditContext()
    parser.find_org_date_number(contract, actx)
    parser.find_attributes(contract, actx)
    parser._logstep("analyze_contract")
def test_process_charters_phase_1(self):
    """Preprocess every CHARTER document of the first available audit."""
    audits = get_audits()
    if len(audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = audits[0]['_id']
    docs: [dict] = get_docs_by_audit_id(first_audit_id, kind='CHARTER')

    processor = document_processors.get('CHARTER')
    for raw_doc in docs:
        processor.preprocess(DbJsonDoc(raw_doc), AuditContext())
def audit_charters_phase_1():
    """preprocess"""
    charters = get_all_new_charters()
    processor: BaseProcessor = document_processors[CHARTER]

    total = len(charters)
    for idx, charter in enumerate(charters):
        json_doc = DbJsonDoc(charter)
        logger.info(
            f'......pre-processing {idx} of {total} CHARTER {json_doc.get_id()}'
        )
        processor.preprocess(json_doc, context=AuditContext())
def test_analyze_contract(self):
    """Run preprocess + process for one known CONTRACT document from the DB."""
    processor: BaseProcessor = document_processors[CONTRACT]

    doc = get_doc_by_id(ObjectId('5ded004e4ddc27bcf92dd47c'))
    if doc is None:
        raise RuntimeError("fix unit test please")
    audit = get_audit_by_id(doc['auditId'])

    json_doc = DbJsonDoc(doc)
    logger.info(f'......pre-processing {json_doc._id}')

    ctx = AuditContext()
    processor.preprocess(json_doc, context=ctx)
    processor.process(json_doc, audit, ctx)
def test_analyze_charter(self):
    """Run preprocess + process for one known CHARTER document from the DB."""
    processor: BaseProcessor = document_processors[CHARTER]

    doc = get_doc_by_id(ObjectId('5e5de70d01c6c73c19eebd48'))
    if doc is None:
        raise RuntimeError("fix unit test please")
    audit = get_audit_by_id(doc['auditId'])

    json_doc = DbJsonDoc(doc)
    logger.info(f'......pre-processing {json_doc._id}')

    ctx = AuditContext()
    processor.preprocess(json_doc, context=ctx)
    processor.process(json_doc, audit, ctx)
def test_to_json(self):
    """Serialize an analyzed document via DocumentJson and print the JSON."""
    doc, factory, ctx = self._get_doc_factory_ctx()
    # hacks for old pickles: reset fields that legacy pickled documents lack
    doc.__dict__['number'] = None
    doc.__dict__['date'] = None
    doc.__dict__['warnings'] = []
    doc.__dict__['attributes_tree'] = ContractSchema()

    ctx.find_attributes(doc, AuditContext())

    json_struct = DocumentJson(doc)
    print(json_struct.dumps())
def test_process_contracts_phase_1(self):
    """Preprocess every CONTRACT document of the first available audit."""
    # runner = Runner.get_instance()
    audits = get_audits()
    if len(audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = audits[0]['_id']
    docs = get_docs_by_audit_id(first_audit_id, kind='CONTRACT')

    processor = document_processors.get('CONTRACT')
    for raw_doc in docs:
        processor.preprocess(DbJsonDoc(raw_doc), AuditContext())
def audit_charters_phase_2():  # XXX: #TODO: DO NOT LOAD ALL CHARTERS AT ONCE
    """Analyze (phase 2) all preprocessed/errored charters, outside any audit.

    Charters are processed with ``audit=None`` because they are not bound to
    a single audit at this stage.
    """
    charters = get_docs_by_audit_id(
        id=None,
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=CHARTER)

    # perf: the processor depends only on the kind (CHARTER), so resolve the
    # loop-invariant lookup once instead of once per charter
    processor: BaseProcessor = document_processors[CHARTER]

    for k, _document in enumerate(charters):
        jdoc = DbJsonDoc(_document)
        logger.info(
            f'......processing {k} of {len(charters)} CHARTER {jdoc.get_id()}'
        )
        ctx = AuditContext()
        processor.process(jdoc, audit=None, context=ctx)
def test_process_protocols_phase_1(self):
    """Find org/date/number in every PROTOCOL of every audit and save the analysis."""
    runner = get_runner_instance_no_embedder()
    for audit in get_audits():
        protocol_docs = get_docs_by_audit_id(audit['_id'], kind='PROTOCOL')
        for raw_doc in protocol_docs:
            # charter = runner.make_legal_doc(doc)
            json_doc = DbJsonDoc(raw_doc)
            legal_doc = json_doc.asLegalDoc()
            runner.protocol_parser.find_org_date_number(legal_doc, AuditContext())
            save_analysis(json_doc, legal_doc, -1)
def test_contract_analyze(self):
    """Check that find_attributes extracts the expected value and currency tags."""
    doc, factory, ctx = self._get_doc_factory_ctx()
    # hacks for old pickles: reset fields that legacy pickled documents lack
    doc.__dict__['number'] = None
    doc.__dict__['date'] = None
    doc.__dict__['attributes_tree'] = ContractSchema()

    ctx.find_attributes(doc, AuditContext())
    tags: [SemanticTag] = doc.get_tags()

    value_tag = SemanticTag.find_by_kind(tags, ContractTags.Value.display_string)
    self.assertEqual('80000,00', doc.tokens_map.text_range(value_tag.span))

    currency_tag = SemanticTag.find_by_kind(tags, ContractTags.Currency.display_string)
    self.assertEqual('рублей', doc.tokens_map.text_range(currency_tag.span))
def test_analyse_acontract(self):
    """Run the full CONTRACT pipeline for one known document in its audit's context."""
    # document and audit are fetched via the project's DB helpers
    # (previously via a hand-built MongoClient against 192.168.10.36:27017)
    doc = get_doc_by_id(ObjectId('5fdb213f542ce403c92b4530'))
    audit = get_audit_by_id(doc['auditId'])

    json_doc = DbJsonDoc(doc)
    logger.info(f'......pre-processing {json_doc._id}')

    subsidiary_name: str = audit["subsidiary"]["name"]
    ctx = AuditContext(subsidiary_name)

    processor: BaseProcessor = document_processors[CONTRACT]
    processor.preprocess(json_doc, context=ctx)
    processor.process(json_doc, audit, ctx)
    print(json_doc)
def test_get_org_names(self):
    """Find org/date/number tags in every CHARTER of the first audit and print them."""
    parser = CharterParser()
    audits = get_audits()
    if len(audits) == 0:
        logger.warning('no audits')
        return

    first_audit_id = audits[0]['_id']
    docs = get_docs_by_audit_id(first_audit_id, kind=CHARTER)
    for db_document in docs:
        print(db_document['filename'])
        charter: CharterDocument = join_paragraphs(db_document['parse'],
                                                   doc_id=db_document['_id'])

        # TODO: mind, this could be slow if embedding is required
        parser.find_org_date_number(charter, AuditContext())
        for tag in charter.get_tags():
            print(tag)
def _preprocess_single_doc(self, kind):
    """Preprocess every document of the given kind fetched from the DB.

    :param kind: document kind key used both to query the DB and to pick the processor
    """
    # perf: the processor depends only on `kind`, so resolve the loop-invariant
    # lookup once instead of once per document
    processor = document_processors.get(kind)
    for doc in self._get_doc_from_db(kind):
        processor.preprocess(DbJsonDoc(doc), AuditContext())