def export_updated_contracts_to_json(document_ids, work_dir):
    """Export the given documents to ``contracts_mongo.json`` in *work_dir*.

    Fetches each document by id, warns about documents not linked to an
    audit, and writes them all as one JSON object keyed by the stringified
    document ``_id``.

    :param document_ids: iterable of Mongo ObjectIds to export
    :param work_dir: directory that receives ``contracts_mongo.json``
    """
    exported = {}
    for k, doc_id in enumerate(document_ids):
        d = get_doc_by_id(doc_id)
        if 'auditId' not in d:
            logger.warning(f'error: doc {d["_id"]} is not linked to any audit')
        exported[str(d['_id'])] = d
        logger.debug(f"exporting JSON {k} {d['_id']}")

    out_path = os.path.join(work_dir, 'contracts_mongo.json')
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(exported, outfile, indent=2, ensure_ascii=False,
                  default=json_util.default)

    # BUG FIX: previously logged the last enumerate index (len-1, and 0 for an
    # empty input) instead of the actual number of exported documents.
    logger.info(f'EXPORTED {len(exported)} docs')
def audit_phase_1(audit, kind=None):
    """Pre-process every new document of an audit, plus its linked charters.

    Documents whose type has no registered processor are skipped with a
    warning; the rest are pre-processed when they still need analysis and
    are in the New state.
    """
    logger.info(f'.....processing audit {audit["_id"]}')
    ctx = AuditContext(audit["subsidiary"]["name"])

    document_ids = get_docs_by_audit_id(
        audit["_id"], states=[DocumentState.New.value], kind=kind, id_only=True)
    document_ids.extend(audit.get("charters", []))

    total = len(document_ids)
    for k, document_id in enumerate(document_ids):
        jdoc = DbJsonDoc(finalizer.get_doc_by_id(document_id))
        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}')
            continue
        logger.info(
            f'......pre-processing {k} of {total} {jdoc.documentType}:{document_id}')
        if need_analysis(jdoc) and jdoc.isNew():
            processor.preprocess(jdoc=jdoc, context=ctx)
def audit_phase_2(audit, kind=None):
    """Process every pre-processed (or errored) document of an audit.

    Mirrors :func:`audit_phase_1` but runs the main ``process`` step on
    documents in the Preprocessed/Error states, then moves the audit to
    the "Finalizing" status.
    """
    ctx = AuditContext(audit["subsidiary"]["name"])
    # CONSISTENCY FIX: was a bare print(); audit_phase_1 logs the identical
    # message via logger.info.
    logger.info(f'.....processing audit {audit["_id"]}')

    document_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=kind,
        id_only=True)
    document_ids.extend(audit.get("charters", []))

    for k, document_id in enumerate(document_ids):
        _document = finalizer.get_doc_by_id(document_id)
        jdoc = DbJsonDoc(_document)
        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}')
        else:
            if need_analysis(jdoc) and jdoc.isPreprocessed():
                logger.info(
                    f'.....processing {k} of {len(document_ids)} {jdoc.documentType} {document_id}')
                processor.process(jdoc, audit, ctx)

    change_audit_status(audit, "Finalizing")  # TODO: check ALL docs in proper state
def _test_contract():
    """Build the attribute tree for a known contract document.

    Returns the (json_object, json_string, raw_doc) triple for manual
    inspection.
    """
    contract_doc = get_doc_by_id(ObjectId('5f0bb4bd138e9184feef1fa8'))
    attributes = contract_doc['analysis']['attributes']
    tree = {"contract": convert_contract_db_attributes_to_tree(attributes)}
    as_json, as_string = to_json(tree)
    return as_json, as_string, contract_doc
def _test_charter():
    """Build the attribute tree for a known charter document.

    Returns the (json_object, json_string, raw_doc) triple for manual
    inspection.
    """
    charter_doc = get_doc_by_id(ObjectId('5f64161009d100a445b7b0d6'))
    attributes = charter_doc['analysis']['attributes']
    tree = {"charter": convert_charter_db_attributes_to_tree(attributes)}
    as_json, as_string = to_json(tree)
    return as_json, as_string, charter_doc
def import_recent_contracts(self):
    """Refresh the contract trainset with every recently updated contract.

    Loads the trainset metadata, saves a datapoint for each updated
    contract, then persists the stats.
    """
    self.stats: DataFrame = self.load_contract_trainset_meta()
    # Materialize ids first (as before), then fetch each full document.
    updated_ids = [item["_id"] for item in self.get_updated_contracts()]
    for oid in updated_ids:
        self.save_contract_datapoint(DbJsonDoc(get_doc_by_id(oid)))
    self._save_stats()
def get_attributes_tree(id: str):
    """Return the ``charter`` branch of a document's analysis attributes tree.

    :param id: hex string of the document ObjectId
        (name shadows the ``id`` builtin — kept for caller compatibility)
    :return: the charter subtree, or None when the document has no analysis
        or no ``attributes_tree``.
    """
    # NOTE(review): return value was always unused; the call presumably
    # initializes/caches the mongo connection — confirm before removing.
    get_mongodb_connection()
    doc = get_doc_by_id(ObjectId(id))
    analysis = doc.get('analysis')
    if analysis:
        tree = analysis.get('attributes_tree')
        # BUG FIX: guard missing tree — dotdict(None) would raise TypeError.
        if tree is not None:
            return dotdict(tree).charter
    return None
def test_analyze_charter(self):
    """End-to-end check: preprocess then process a known charter document."""
    doc = get_doc_by_id(ObjectId('5e5de70d01c6c73c19eebd48'))
    if doc is None:
        raise RuntimeError("fix unit test please")

    audit = get_audit_by_id(doc['auditId'])
    jdoc = DbJsonDoc(doc)
    logger.info(f'......pre-processing {jdoc._id}')

    charter_processor: BaseProcessor = document_processors[CHARTER]
    ctx = AuditContext()
    charter_processor.preprocess(jdoc, context=ctx)
    charter_processor.process(jdoc, audit, ctx)
def test_analyze_contract(self):
    """End-to-end check: preprocess then process a known contract document."""
    doc = get_doc_by_id(ObjectId('5ded004e4ddc27bcf92dd47c'))
    if doc is None:
        raise RuntimeError("fix unit test please")

    audit = get_audit_by_id(doc['auditId'])
    jdoc = DbJsonDoc(doc)
    logger.info(f'......pre-processing {jdoc._id}')

    contract_processor: BaseProcessor = document_processors[CONTRACT]
    ctx = AuditContext()
    contract_processor.preprocess(jdoc, context=ctx)
    contract_processor.process(jdoc, audit, ctx)
def _get_doc_from_db(self, kind):
    """Yield a raw document of *kind* from the oldest audit.

    Looks at the single oldest audit (by createDate) and, if it has any
    documents of the given kind in state 15, yields the first one.
    """
    oldest_audits = (get_mongodb_connection()['audits']
                     .find()
                     .sort([("createDate", pymongo.ASCENDING)])
                     .limit(1))
    for audit in oldest_audits:
        # NOTE(review): 15 is a magic document-state value — presumably a
        # DocumentState member; confirm and replace with the enum.
        doc_ids = get_docs_by_audit_id(audit['_id'], kind=kind, states=[15], id_only=True)
        if doc_ids:
            print(doc_ids[0])
            yield finalizer.get_doc_by_id(doc_ids[0])
def test_analyse_acontract(self):
    """End-to-end check: preprocess then process a known contract,
    using the audit's subsidiary for the context."""
    doc = get_doc_by_id(ObjectId('5fdb213f542ce403c92b4530'))
    audit = get_audit_by_id(doc['auditId'])
    jdoc = DbJsonDoc(doc)
    logger.info(f'......pre-processing {jdoc._id}')

    subsidiary_name: str = audit["subsidiary"]["name"]
    ctx = AuditContext(subsidiary_name)

    contract_processor: BaseProcessor = document_processors[CONTRACT]
    contract_processor.preprocess(jdoc, context=ctx)
    contract_processor.process(jdoc, audit, ctx)
    print(jdoc)
def _test_protocol():
    """Convert one known protocol document via convert_one."""
    mongo_db = get_mongodb_connection()
    protocol_doc = get_doc_by_id(ObjectId('5df7a66b200a3f4d0fad786f'))
    convert_one(mongo_db, protocol_doc)