def test_get_docs_by_audit_id(self):
    """Print ``_id`` and ``filename`` of each PROTOCOL document in the first audit.

    The first entry returned by ``get_audits()`` is used. When there are no
    audits a warning is logged and the method returns without doing anything.
    """
    all_audits = get_audits()
    if not len(all_audits):
        logger.warning('no audits')
        return

    first_audit_id = all_audits[0]['_id']
    for record in get_docs_by_audit_id(first_audit_id, kind='PROTOCOL'):
        print(record['_id'], record['filename'])
def test_process_charters_phase_1(self):
    """Run the CHARTER processor's ``preprocess`` on the first audit's charters.

    Takes the first entry returned by ``get_audits()``, fetches its documents
    of kind ``'CHARTER'``, wraps each one in ``DbJsonDoc`` and passes it to
    ``preprocess`` together with a fresh ``AuditContext``.

    When there are no audits a warning is logged and the method returns
    without processing anything.
    """
    audits = get_audits()
    if not audits:
        logger.warning('no audits')
        return

    audit_id = audits[0]['_id']
    # The previous annotation `[dict]` was a list literal, not a type;
    # `list[dict]` expresses the intended "list of dicts". Annotations on
    # local variables are never evaluated, so this has no runtime effect.
    docs: list[dict] = get_docs_by_audit_id(audit_id, kind='CHARTER')
    processor = document_processors.get('CHARTER')
    for _doc in docs:
        jdoc = DbJsonDoc(_doc)
        # A new AuditContext is created for every document.
        processor.preprocess(jdoc, AuditContext())
def _get_doc_from_db(self, kind):
    """Yield a document of ``kind`` from the first audit ordered by ``createDate``.

    The ``audits`` collection is sorted by ``createDate`` ascending and limited
    to one entry. For that audit, document ids are requested with
    ``states=[15]`` and ``id_only=True``; if any come back, the first id is
    printed and the document loaded through ``finalizer.get_doc_by_id`` is
    yielded as-is (it is not wrapped in ``DbJsonDoc``).

    :param kind: document kind forwarded to ``get_docs_by_audit_id``.
    """
    audits_collection = get_mongodb_connection()['audits']
    ordered = audits_collection.find().sort([("createDate", pymongo.ASCENDING)])
    for audit in ordered.limit(1):
        ids = get_docs_by_audit_id(
            audit['_id'], kind=kind, states=[15], id_only=True)
        if len(ids) == 0:
            continue

        first_id = ids[0]
        print(first_id)
        yield finalizer.get_doc_by_id(first_id)
def test_process_contracts_phase_1(self):
    """Run the CONTRACT processor's ``preprocess`` on the first audit's contracts.

    Each document of kind ``'CONTRACT'`` belonging to the first entry of
    ``get_audits()`` is wrapped in ``DbJsonDoc`` and preprocessed with a fresh
    ``AuditContext``. Logs a warning and returns when there are no audits.
    """
    found_audits = get_audits()
    if len(found_audits) < 1:
        logger.warning('no audits')
        return

    contracts = get_docs_by_audit_id(found_audits[0]['_id'], kind='CONTRACT')
    contract_processor = document_processors.get('CONTRACT')
    for raw in contracts:
        wrapped = DbJsonDoc(raw)
        contract_processor.preprocess(wrapped, AuditContext())
def test_process_protocols_phase_1(self):
    """Run the protocol parser over every PROTOCOL document of every audit.

    For each audit from ``get_audits()`` and each of its ``'PROTOCOL'``
    documents, the document is wrapped in ``DbJsonDoc``, converted with
    ``asLegalDoc()``, passed to ``protocol_parser.find_org_date_number`` with a
    fresh ``AuditContext``, and then handed to ``save_analysis`` with ``-1`` as
    the third argument.
    """
    runner = get_runner_instance_no_embedder()
    for audit in get_audits():
        for raw_doc in get_docs_by_audit_id(audit['_id'], kind='PROTOCOL'):
            db_doc = DbJsonDoc(raw_doc)
            protocol = db_doc.asLegalDoc()
            runner.protocol_parser.find_org_date_number(protocol, AuditContext())
            save_analysis(db_doc, protocol, -1)
def test_get_org_names(self):
    """Parse the first audit's charters and print their tags.

    For every document of kind ``CHARTER`` in the first entry of
    ``get_audits()``: print its ``filename``, build a charter from its
    ``parse`` payload via ``join_paragraphs``, run
    ``CharterParser.find_org_date_number`` on it, and print each tag from
    ``get_tags()``. Logs a warning and returns when there are no audits.
    """
    charter_parser = CharterParser()
    audits = get_audits()
    if not len(audits):
        logger.warning('no audits')
        return

    for record in get_docs_by_audit_id(audits[0]['_id'], kind=CHARTER):
        print(record['filename'])
        charter: CharterDocument = join_paragraphs(
            record['parse'], doc_id=record['_id'])

        # TODO: mind, this could be slow if embedding is required
        charter_parser.find_org_date_number(charter, AuditContext())

        for tag in charter.get_tags():
            print(tag)