def init_model(self,
               model_factory_fn,
               model_name_override=None,
               weights_file_override=None,
               verbose=0,
               trainable=True,
               trained=False,
               load_weights=True) -> Model:

  model_name = model_factory_fn.__name__
  if model_name_override is not None:
    model_name = model_name_override

  model = model_factory_fn(name=model_name, ctx=self, trained=trained)
  # model.name = model_name
  if verbose > 1:
    model.summary()

  ch_fn = os.path.join(self.model_checkpoint_path, f"{model_name}-{keras.__version__}.h5")
  if weights_file_override is not None:
    ch_fn = os.path.join(self.model_checkpoint_path, f"{weights_file_override}-{keras.__version__}.h5")

  if load_weights:
    try:
      model.load_weights(ch_fn)
      logger.info(f'weights loaded: {ch_fn}')
    except Exception:
      msg = f'cannot load {model_name} from {ch_fn}'
      warnings.warn(msg)
      if trained:
        raise FileNotFoundError(msg)

  if not trainable:
    KerasTrainingContext.freezeModel(model)

  return model
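# Hypothetical usage sketch (not part of the original module): init_model expects a factory
# callable whose __name__ drives the checkpoint file name and that accepts the
# (name, ctx, trained) keyword arguments used above. The toy factory below is an assumption
# for illustration only.
def _toy_model_factory(name=None, ctx=None, trained=False) -> Model:
  inputs = keras.Input(shape=(10,), name=f'{name}_input')
  outputs = keras.layers.Dense(1, name=f'{name}_out')(inputs)
  return Model(inputs=inputs, outputs=outputs, name=name)

# ctx = KerasTrainingContext()
# model = ctx.init_model(_toy_model_factory, verbose=2, load_weights=False)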
def run(run_phase_2=True, kind=None):
  # -----------------------
  logger.info('-> PHASE 0 (charters)...')
  # NIL (sorry, there is no zero in the Roman numeral system)
  audit_charters_phase_1()
  if run_phase_2:
    audit_charters_phase_2()

  # -----------------------
  # I
  logger.info('-> PHASE I...')
  for audit in get_audits():
    audit_phase_1(audit, kind)

  # -----------------------
  # II
  logger.info('-> PHASE II...')
  if run_phase_2:
    # phase 2
    for audit in get_audits():
      audit_phase_2(audit, kind)
  else:
    logger.info("phase 2 is skipped")

  # -----------------------
  # III
  logger.info('-> PHASE III (finalize)...')
  finalizer.finalize()
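# Hypothetical entry point (an assumption; the real module may be wired up differently,
# e.g. scheduled or invoked from another service):
# if __name__ == '__main__':
#   run(run_phase_2=True)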
def audit_phase_1(audit, kind=None):
  logger.info(f'.....processing audit {audit["_id"]}')
  ctx = AuditContext(audit["subsidiary"]["name"])

  document_ids = get_docs_by_audit_id(audit["_id"],
                                      states=[DocumentState.New.value],
                                      kind=kind,
                                      id_only=True)

  _charter_ids = audit.get("charters", [])
  document_ids.extend(_charter_ids)

  for k, document_id in enumerate(document_ids):
    _document = finalizer.get_doc_by_id(document_id)
    jdoc = DbJsonDoc(_document)

    processor: BaseProcessor = document_processors.get(jdoc.documentType)
    if processor is None:
      logger.warning(f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}')
    else:
      logger.info(f'......pre-processing {k} of {len(document_ids)} {jdoc.documentType}:{document_id}')
      if need_analysis(jdoc) and jdoc.isNew():
        processor.preprocess(jdoc=jdoc, context=ctx)
def audit_phase_2(audit, kind=None):
  ctx = AuditContext(audit["subsidiary"]["name"])
  logger.info(f'.....processing audit {audit["_id"]}')

  document_ids = get_docs_by_audit_id(audit["_id"],
                                      states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
                                      kind=kind,
                                      id_only=True)

  _charter_ids = audit.get("charters", [])
  document_ids.extend(_charter_ids)

  for k, document_id in enumerate(document_ids):
    _document = finalizer.get_doc_by_id(document_id)
    jdoc = DbJsonDoc(_document)

    processor: BaseProcessor = document_processors.get(jdoc.documentType)
    if processor is None:
      logger.warning(f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}')
    else:
      if need_analysis(jdoc) and jdoc.isPreprocessed():
        logger.info(f'.....processing {k} of {len(document_ids)} {jdoc.documentType} {document_id}')
        processor.process(jdoc, audit, ctx)

  change_audit_status(audit, "Finalizing")  # TODO: check that ALL docs are in the proper state
def finalize():
  audits = get_audits()
  for audit in audits:
    if audit["subsidiary"]["name"] == "Все ДО":
      logger.info(f'.....audit {audit["_id"]} finalizing skipped')
      continue

    logger.info(f'.....finalizing audit {audit["_id"]}')
    violations = []

    # 15 is a document state code (magic number kept as in the original)
    contract_ids = get_docs_by_audit_id(audit["_id"], 15, "CONTRACT", id_only=True)

    charters = []
    if audit.get("charters") is not None:
      for charter_id in audit["charters"]:
        charter = get_doc_by_id(charter_id)
        if (charter.get("isActive") is None or charter["isActive"]) and charter["state"] == 15:
          charters.append(charter)
      cleaned_charters = exclude_same_charters(charters)
      charters = sorted(cleaned_charters, key=lambda k: get_attrs(k)["date"]["value"])

    protocols = get_docs_by_audit_id(audit["_id"], 15, "PROTOCOL", without_large_fields=True)

    for contract_id in contract_ids:
      contract = get_doc_by_id(contract_id["_id"])
      violations.extend(check_contract(contract, charters, protocols, audit))

    save_violations(audit, violations)
    logger.info(f'.....audit {audit["_id"]} is waiting for approval')
def embedd_large(self, text_map, max_tokens=6000, log_addon=''):
  elmo_logger.info(f'{log_addon} {len(text_map)} max_tokens={max_tokens}')

  overlap = max_tokens // 20
  number_of_windows = 1 + len(text_map) // max_tokens
  window = max_tokens

  msg = (f"{log_addon} Document is too large for embedding: {len(text_map)} tokens. "
         f"Splitting into {number_of_windows} windows overlapping by {overlap} tokens")
  elmo_logger.warning(msg)

  start = 0
  embeddings = None
  # tokens = []
  while start < len(text_map):
    subtokens: Tokens = text_map[start:start + window + overlap]
    elmo_logger.debug(f"{log_addon} Embedding region: {start}, {len(subtokens)}")

    sub_embeddings = self.embedd_tokens(subtokens)[0:window]

    if embeddings is None:
      embeddings = sub_embeddings
    else:
      embeddings = np.concatenate([embeddings, sub_embeddings])

    start += window

  return embeddings
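# Worked example of the windowing arithmetic above (illustrative numbers, not from the
# original source): a 15000-token document with max_tokens=6000 gives
#   window = 6000, overlap = 6000 // 20 = 300, number_of_windows = 1 + 15000 // 6000 = 3.
# The loop embeds the slices [0:6300], [6000:12300] and [12000:15000], keeping only the
# first `window` vectors of each slice, so the extra `overlap` tokens merely supply
# right-hand context at the window boundary and their embeddings are discarded.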
def __init__(self, checkpoints_path=models_path, session_index=0):
  self.session_index = session_index
  self.HISTORIES = {}
  self.model_checkpoint_path = checkpoints_path
  self.EVALUATE_ONLY = True
  self.EPOCHS = 18
  self.trained_models = {}

  self.validation_steps = 1
  self.steps_per_epoch = 1

  self.reduce_lr = ReduceLROnPlateau(monitor='loss',
                                     factor=0.2,
                                     patience=5,
                                     min_lr=1E-6,
                                     verbose=1)

  logger.info(f"model_checkpoint_path: {checkpoints_path}")
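# Hypothetical sketch (not in the original code) of how the hyperparameters and the
# reduce_lr callback configured above might feed a standard Keras training call;
# train_data / val_data are placeholder dataset objects.
def _train_with_context_sketch(ctx, model: Model, train_data, val_data):
  return model.fit(train_data,
                   epochs=ctx.EPOCHS,
                   steps_per_epoch=ctx.steps_per_epoch,
                   validation_data=val_data,
                   validation_steps=ctx.validation_steps,
                   callbacks=[ctx.reduce_lr])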
def audit_charters_phase_1():
  """Pre-process all new charters (phase I)."""
  charters = get_all_new_charters()
  processor: BaseProcessor = document_processors[CHARTER]

  for k, charter in enumerate(charters):
    jdoc = DbJsonDoc(charter)
    logger.info(f'......pre-processing {k} of {len(charters)} CHARTER {jdoc.get_id()}')
    ctx = AuditContext()
    processor.preprocess(jdoc, context=ctx)
def test_analyze_charter(self):
  processor: BaseProcessor = document_processors[CHARTER]

  doc = get_doc_by_id(ObjectId('5e5de70d01c6c73c19eebd48'))
  if doc is None:
    raise RuntimeError("fix unit test please")

  audit = get_audit_by_id(doc['auditId'])

  jdoc = DbJsonDoc(doc)
  logger.info(f'......pre-processing {jdoc._id}')
  ctx = AuditContext()
  processor.preprocess(jdoc, context=ctx)
  processor.process(jdoc, audit, ctx)
def test_analyze_contract(self):
  processor: BaseProcessor = document_processors[CONTRACT]

  doc = get_doc_by_id(ObjectId('5ded004e4ddc27bcf92dd47c'))
  if doc is None:
    raise RuntimeError("fix unit test please")

  audit = get_audit_by_id(doc['auditId'])

  jdoc = DbJsonDoc(doc)
  logger.info(f'......pre-processing {jdoc._id}')
  ctx = AuditContext()
  processor.preprocess(jdoc, context=ctx)
  processor.process(jdoc, audit, ctx)
def preprocess(self, jdoc: DbJsonDoc, context: AuditContext):
  # phase I
  # TODO: include phase I into phase II, remove phase I
  if jdoc.is_user_corrected():
    logger.info(f"skipping doc {jdoc.get_id()} because it is corrected by user")
    # TODO: update state?
  else:
    legal_doc = jdoc.asLegalDoc()
    self.parser.find_org_date_number(legal_doc, context)
    save_analysis(jdoc, legal_doc, state=DocumentState.Preprocessed.value)
    return legal_doc
def audit_charters_phase_2():
  # XXX: TODO: DO NOT LOAD ALL CHARTERS AT ONCE
  charters = get_docs_by_audit_id(id=None,
                                  states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
                                  kind=CHARTER)

  for k, _document in enumerate(charters):
    jdoc = DbJsonDoc(_document)
    processor: BaseProcessor = document_processors[CHARTER]
    logger.info(f'......processing {k} of {len(charters)} CHARTER {jdoc.get_id()}')
    ctx = AuditContext()
    processor.process(jdoc, audit=None, context=ctx)
def test_analyse_acontract(self):
  doc = get_doc_by_id(ObjectId('5fdb213f542ce403c92b4530'))
  # _db_client = MongoClient(f'mongodb://192.168.10.36:27017/')
  # _db_client.server_info()
  # db = _db_client['gpn']
  # documents_collection = db['documents']
  # doc = documents_collection.find_one({"_id": ObjectId('5fdb213f542ce403c92b4530')})
  # audit = db['audits'].find_one({'_id': doc['auditId']})
  audit = get_audit_by_id(doc['auditId'])

  jdoc = DbJsonDoc(doc)
  logger.info(f'......pre-processing {jdoc._id}')

  _audit_subsidiary: str = audit["subsidiary"]["name"]
  ctx = AuditContext(_audit_subsidiary)

  processor: BaseProcessor = document_processors[CONTRACT]
  processor.preprocess(jdoc, context=ctx)
  processor.process(jdoc, audit, ctx)
  print(jdoc)
def process(self, db_document: DbJsonDoc, audit, context: AuditContext) -> LegalDocument:
  # phase II
  if db_document.retry_number is None:
    db_document.retry_number = 0

  if db_document.retry_number > 2:
    logger.error(f'{db_document.documentType} {db_document.get_id()} exceeds maximum retries for analysis and is skipped')
    return None

  legal_doc = db_document.asLegalDoc()
  try:
    # self.parser.find_org_date_number(legal_doc, context)
    # todo: remove the call above; make sure it is done in phase 1, BUT phase 1 is deprecated ;-)
    # save_analysis(db_document, legal_doc, state=DocumentState.InWork.value)

    if audit is None or self.is_valid(audit, db_document):

      if db_document.is_user_corrected():
        logger.info(f"skipping doc {db_document.get_id()} postprocessing because it is corrected by user")
        change_doc_state(db_document, state=DocumentState.Done.value)
      else:
        # ANALYSING
        self.parser.find_attributes(legal_doc, context)
        save_analysis(db_document, legal_doc, state=DocumentState.Done.value)
        # ANALYSING
        logger.info(f'analysis saved, doc._id={legal_doc.get_id()}')
    else:
      logger.info(f"excluding doc {db_document.get_id()}")
      # the doc is not saved here because we did NOT search for attributes
      change_doc_state(db_document, state=DocumentState.Excluded.value)

  except Exception as err:
    traceback.print_tb(err.__traceback__)
    logger.exception(f'cannot process document {db_document.get_id()}')
    # TODO: do not save the entire doc here, data loss is possible
    save_analysis(db_document, legal_doc, DocumentState.Error.value, db_document.retry_number + 1)

  return legal_doc
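# Summary of the document state flow implied by preprocess() and process() above
# (an editor's reading of the code, not an original comment):
#   New --preprocess--> Preprocessed --process--> Done
#                                        |------> Excluded  (is_valid(audit, doc) is False)
#                                        |------> Error     (exception; retry_number is incremented,
#                                                            documents are skipped once retry_number > 2)
#   user-corrected documents bypass re-analysis in process() and are moved straight to Done.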
def resave_model_h5(self, model_factory_fn):
  model = self.init_model(model_factory_fn, load_weights=False)
  model.summary()

  model_name = model_factory_fn.__name__

  ch_fn_old = os.path.join(self.model_checkpoint_path, f"{model_name}.weights")
  model.load_weights(ch_fn_old)
  logger.info(f'model weights loaded: {ch_fn_old}')

  ch_fn = os.path.join(self.model_checkpoint_path, f"{model_name}-{keras.__version__}.h5")
  if not os.path.isfile(ch_fn):
    model.save_weights(ch_fn)
    logger.info(f"model weights saved to {ch_fn}")
  else:
    logger.info(f"model weights NOT saved, because file exists {ch_fn}")
def _build_session_and_graph(self):
  embedding_graph = tf.compat.v1.Graph()

  with embedding_graph.as_default():
    logger.info(f'< loading ELMO module {self.module_url}')
    logger.info(f'TF hub cache dir is {os.environ["TFHUB_CACHE_DIR"]}')
    _elmo = hub.Module(self.module_url, trainable=False)
    logger.info('ELMO module loaded >')

    self.text_input = tf.compat.v1.placeholder(dtype='string', name="text_input")
    self.text_lengths = tf.compat.v1.placeholder(dtype='int32', name='text_lengths')

    inputs_elmo = {
      "tokens": self.text_input,
      "sequence_len": self.text_lengths
    }
    inputs_default = {"strings": self.text_input}

  with embedding_graph.as_default():
    logger.info('ELMO: creating embedded_out_elmo')
    self.embedded_out_elmo = _elmo(inputs=inputs_elmo, signature="tokens", as_dict=True)['elmo']

    logger.info('ELMO: creating embedded_out_defaut')
    self.embedded_out_defaut = _elmo(inputs=inputs_default, signature="default", as_dict=True)['default']

  with embedding_graph.as_default():
    self.session = tf.compat.v1.Session(graph=embedding_graph)
    init_op = tf.group([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
    ])
    self.session.run(init_op)

  embedding_graph.finalize()
  logger.info('graph finalized >>')
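# Hypothetical sketch (an assumption; embedd_tokens itself is not shown in this section)
# of how the placeholders and ops built above could be evaluated for one token sequence;
# the real implementation may batch and pad differently.
def _embedd_tokens_sketch(self, tokens):
  feed = {
    self.text_input: [tokens],         # a batch with a single token sequence
    self.text_lengths: [len(tokens)],  # its true (unpadded) length
  }
  # the 'elmo' output has shape (batch, seq_len, 1024); take the first (only) batch item
  return self.session.run(self.embedded_out_elmo, feed_dict=feed)[0]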