Example #1
File: tools.py Project: nemoware/analyser
  def init_model(self, model_factory_fn, model_name_override=None, weights_file_override=None,
                 verbose=0,
                 trainable=True, trained=False, load_weights=True) -> Model:

    model_name = model_factory_fn.__name__
    if model_name_override is not None:
      model_name = model_name_override

    model = model_factory_fn(name=model_name, ctx=self, trained=trained)
    # model.name = model_name
    if verbose > 1:
      model.summary()

    ch_fn = os.path.join(self.model_checkpoint_path, f"{model_name}-{keras.__version__}.h5")

    if weights_file_override is not None:
      ch_fn = os.path.join(self.model_checkpoint_path, f"{weights_file_override}-{keras.__version__}.h5")

    if load_weights:
      try:
        model.load_weights(ch_fn)
        logger.info(f'weights loaded: {ch_fn}')
      except Exception:
        msg = f'cannot load {model_name} from {ch_fn}'
        warnings.warn(msg)
        if trained:
          raise FileExistsError(msg)

    if not trainable:
      KerasTrainingContext.freezeModel(model)

    return model
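
A typical call passes a model factory function whose __name__ doubles as the checkpoint-file prefix. The sketch below is a hypothetical illustration of that contract; uplift_model_001 and its layers are assumptions, only the factory signature (name, ctx, trained) is taken from the call inside init_model.

from keras import Model, layers

def uplift_model_001(name, ctx, trained=False):
    # hypothetical factory: init_model invokes it as model_factory_fn(name=..., ctx=self, trained=...)
    inputs = layers.Input(shape=(None, 1024), name='token_embeddings')
    outputs = layers.Dense(2, activation='sigmoid')(inputs)
    return Model(inputs=inputs, outputs=outputs, name=name)

# ctx is a KerasTrainingContext (see Example #7); weights are resolved as
# "<model name>-<keras.__version__>.h5" under ctx.model_checkpoint_path:
# model = ctx.init_model(uplift_model_001, verbose=2, trainable=False)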
Example #2
def run(run_phase_2=True, kind=None):
    # -----------------------
    logger.info('-> PHASE 0 (charters)...')
    # NIL (sorry, the Roman numeral system has no zero)
    audit_charters_phase_1()
    if run_phase_2:
        audit_charters_phase_2()

    # -----------------------
    # I
    logger.info('-> PHASE I...')
    for audit in get_audits():
        audit_phase_1(audit, kind)

    # -----------------------
    # II
    logger.info('-> PHASE II..')
    if run_phase_2:
        # phase 2
        for audit in get_audits():
            audit_phase_2(audit, kind)

    else:
        logger.info("phase 2 is skipped")

    # -----------------------
    # III
    logger.info('-> PHASE III (finalize)...')
    finalizer.finalize()
Example #3
def audit_phase_1(audit, kind=None):
    logger.info(f'.....processing audit {audit["_id"]}')
    ctx = AuditContext(audit["subsidiary"]["name"])

    document_ids = get_docs_by_audit_id(audit["_id"],
                                        states=[DocumentState.New.value],
                                        kind=kind,
                                        id_only=True)
    _charter_ids = audit.get("charters", [])
    document_ids.extend(_charter_ids)

    for k, document_id in enumerate(document_ids):
        _document = finalizer.get_doc_by_id(document_id)
        jdoc = DbJsonDoc(_document)

        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}'
            )
        else:
            logger.info(
                f'......pre-processing {k} of {len(document_ids)}  {jdoc.documentType}:{document_id}'
            )
            if need_analysis(jdoc) and jdoc.isNew():
                processor.preprocess(jdoc=jdoc, context=ctx)
Example #4
def audit_phase_2(audit, kind=None):
    ctx = AuditContext(audit["subsidiary"]["name"])

    print(f'.....processing audit {audit["_id"]}')

    document_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=kind,
        id_only=True)

    _charter_ids = audit.get("charters", [])
    document_ids.extend(_charter_ids)

    for k, document_id in enumerate(document_ids):
        _document = finalizer.get_doc_by_id(document_id)
        jdoc = DbJsonDoc(_document)

        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}'
            )
        else:
            if need_analysis(jdoc) and jdoc.isPreprocessed():
                logger.info(
                    f'.....processing  {k} of {len(document_ids)}   {jdoc.documentType} {document_id}'
                )
                processor.process(jdoc, audit, ctx)

    change_audit_status(audit,
                        "Finalizing")  # TODO: check ALL docs in proper state
Example #5
def finalize():
    audits = get_audits()
    for audit in audits:
        if audit["subsidiary"]["name"] == "Все ДО":
            print(f'.....audit {audit["_id"]} finalizing skipped')
            continue
        logger.info(f'.....finalizing audit {audit["_id"]}')
        violations = []
        contract_ids = get_docs_by_audit_id(audit["_id"],
                                            15,
                                            "CONTRACT",
                                            id_only=True)
        charters = []
        if audit.get("charters") is not None:
            for charter_id in audit["charters"]:
                charter = get_doc_by_id(charter_id)
                if (charter.get("isActive") is None
                        or charter["isActive"]) and charter["state"] == 15:
                    charters.append(charter)
            cleaned_charters = exclude_same_charters(charters)
            charters = sorted(cleaned_charters,
                              key=lambda k: get_attrs(k)["date"]["value"])
        protocols = get_docs_by_audit_id(audit["_id"],
                                         15,
                                         "PROTOCOL",
                                         without_large_fields=True)

        for contract_id in contract_ids:
            contract = get_doc_by_id(contract_id["_id"])
            violations.extend(
                check_contract(contract, charters, protocols, audit))

        save_violations(audit, violations)
        print(f'.....audit {audit["_id"]} is waiting for approval')
Example #6
    def embedd_large(self, text_map, max_tokens=6000, log_addon=''):
        elmo_logger.info(
            f'{log_addon} {len(text_map)} max_tokens={max_tokens}')
        overlap = max_tokens // 20

        number_of_windows = 1 + len(text_map) // max_tokens
        window = max_tokens

        msg = f"{log_addon} Document is too large for embedding: {len(text_map)} tokens. Splitting into {number_of_windows} windows overlapping with {overlap} tokens "
        elmo_logger.warning(msg)

        start = 0
        embeddings = None
        # tokens = []
        while start < len(text_map):

            subtokens: Tokens = text_map[start:start + window + overlap]
            elmo_logger.debug(
                f"{log_addon} Embedding region: {start}, {len(subtokens)}")

            sub_embeddings = self.embedd_tokens(subtokens)[0:window]

            if embeddings is None:
                embeddings = sub_embeddings
            else:
                embeddings = np.concatenate([embeddings, sub_embeddings])

            start += window

        return embeddings
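
The slicing arithmetic in embedd_large can be checked in isolation. A minimal sketch, assuming a plain Python list in place of text_map and skipping the actual ELMo call:

def split_into_windows(tokens, max_tokens=6000):
    # mirrors embedd_large: read window + overlap tokens, keep only the first `window` of them
    overlap = max_tokens // 20
    window = max_tokens
    start = 0
    pieces = []
    while start < len(tokens):
        subtokens = tokens[start:start + window + overlap]
        pieces.append(subtokens[0:window])
        start += window
    return pieces

# 13000 tokens with max_tokens=6000 -> windows of 6000, 6000 and 1000 tokens
print([len(w) for w in split_into_windows(list(range(13000)))])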
Example #7
File: tools.py Project: nemoware/analyser
  def __init__(self, checkpoints_path=models_path, session_index=0):
    self.session_index = session_index
    self.HISTORIES = {}
    self.model_checkpoint_path = checkpoints_path
    self.EVALUATE_ONLY = True
    self.EPOCHS = 18
    self.trained_models = {}
    self.validation_steps = 1
    self.steps_per_epoch = 1

    self.reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=1E-6, verbose=1)
    logger.info(f"model_checkpoint_path: {checkpoints_path}")
Example #8
def audit_charters_phase_1():
    """preprocess"""
    charters = get_all_new_charters()
    processor: BaseProcessor = document_processors[CHARTER]

    for k, charter in enumerate(charters):
        jdoc = DbJsonDoc(charter)
        logger.info(
            f'......pre-processing {k} of {len(charters)} CHARTER {jdoc.get_id()}'
        )
        ctx = AuditContext()
        processor.preprocess(jdoc, context=ctx)
Example #9
    def test_analyze_charter(self):
        processor: BaseProcessor = document_processors[CHARTER]
        doc = get_doc_by_id(ObjectId('5e5de70d01c6c73c19eebd48'))
        if doc is None:
            raise RuntimeError("fix unit test please")

        audit = get_audit_by_id(doc['auditId'])

        jdoc = DbJsonDoc(doc)
        logger.info(f'......pre-processing {jdoc._id}')
        ctx = AuditContext()
        processor.preprocess(jdoc, context=ctx)
        processor.process(jdoc, audit, ctx)
Example #10
    def test_analyze_contract(self):
        processor: BaseProcessor = document_processors[CONTRACT]
        doc = get_doc_by_id(ObjectId('5ded004e4ddc27bcf92dd47c'))
        if doc is None:
            raise RuntimeError("fix unit test please")

        audit = get_audit_by_id(doc['auditId'])

        jdoc = DbJsonDoc(doc)
        logger.info(f'......pre-processing {jdoc._id}')
        ctx = AuditContext()
        processor.preprocess(jdoc, context=ctx)
        processor.process(jdoc, audit, ctx)
Example #11
 def preprocess(self, jdoc: DbJsonDoc, context: AuditContext):
     # phase I
     # TODO: include phase I into phase II, remove phase I
     if jdoc.is_user_corrected():
         logger.info(
             f"skipping doc {jdoc.get_id()} because it is corrected by user"
         )
         # TODO: update state?
     else:
         legal_doc = jdoc.asLegalDoc()
         self.parser.find_org_date_number(legal_doc, context)
         save_analysis(jdoc,
                       legal_doc,
                       state=DocumentState.Preprocessed.value)
         return legal_doc
Example #12
def audit_charters_phase_2():  # XXX: #TODO: DO NOT LOAD ALL CHARTERS AT ONCE
    charters = get_docs_by_audit_id(
        id=None,
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=CHARTER)

    for k, _document in enumerate(charters):
        jdoc = DbJsonDoc(_document)
        processor: BaseProcessor = document_processors[CHARTER]

        logger.info(
            f'......processing  {k} of {len(charters)}  CHARTER {jdoc.get_id()}'
        )
        ctx = AuditContext()
        processor.process(jdoc, audit=None, context=ctx)
Example #13
    def test_analyse_acontract(self):

        doc = get_doc_by_id(ObjectId('5fdb213f542ce403c92b4530'))
        # _db_client = MongoClient(f'mongodb://192.168.10.36:27017/')
        # _db_client.server_info()

        # db = _db_client['gpn']

        # documents_collection = db['documents']

        # doc = documents_collection.find_one({"_id": ObjectId('5fdb213f542ce403c92b4530')} )
        # audit = db['audits'].find_one({'_id': doc['auditId']})
        audit = get_audit_by_id(doc['auditId'])
        jdoc = DbJsonDoc(doc)
        logger.info(f'......pre-processing {jdoc._id}')
        _audit_subsidiary: str = audit["subsidiary"]["name"]

        ctx = AuditContext(_audit_subsidiary)
        processor: BaseProcessor = document_processors[CONTRACT]
        processor.preprocess(jdoc, context=ctx)
        processor.process(jdoc, audit, ctx)
        print(jdoc)
Example #14
    def process(self, db_document: DbJsonDoc, audit,
                context: AuditContext) -> LegalDocument:
        # phase II
        if db_document.retry_number is None:
            db_document.retry_number = 0

        if db_document.retry_number > 2:
            logger.error(
                f'{db_document.documentType} {db_document.get_id()} exceeds maximum retries for analysis and is skipped'
            )
            return None

        legal_doc = db_document.asLegalDoc()
        try:

            # self.parser.find_org_date_number(legal_doc, context) # todo: remove this call
            # todo: make sure it is done in phase 1, BUT phase 1 is deprecated ;-)
            # save_analysis(db_document, legal_doc, state=DocumentState.InWork.value)

            if audit is None or self.is_valid(audit, db_document):

                if db_document.is_user_corrected():
                    logger.info(
                        f"skipping doc {db_document.get_id()} postprocessing because it is corrected by user"
                    )
                    change_doc_state(db_document,
                                     state=DocumentState.Done.value)
                else:
                    # ANALYSING
                    self.parser.find_attributes(legal_doc, context)
                    save_analysis(db_document,
                                  legal_doc,
                                  state=DocumentState.Done.value)
                    # ANALYSING

                logger.info(f'analysis saved, doc._id={legal_doc.get_id()}')
            else:
                logger.info(f"excluding doc {db_document.get_id()}")
                # we are not saving the doc here because we did NOT search for attributes
                change_doc_state(db_document,
                                 state=DocumentState.Excluded.value)

        except Exception as err:
            traceback.print_tb(err.__traceback__)
            logger.exception(f'cannot process document {db_document.get_id()}')
            # TODO: do not save the entire doc here, data loss possible
            save_analysis(db_document, legal_doc, DocumentState.Error.value,
                          db_document.retry_number + 1)

        return legal_doc
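
Across these examples, documents move through DocumentState values (New → Preprocessed → Done, with InWork, Excluded and Error as side states). The stand-in below is purely illustrative; the real enum and its numeric values (such as the 15 used in finalize(), Example #5) are defined in the analyser project.

from enum import Enum, auto

class DocumentStateSketch(Enum):  # hypothetical stand-in, auto() values are placeholders
    New = auto()
    InWork = auto()
    Preprocessed = auto()
    Done = auto()
    Excluded = auto()
    Error = auto()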
Example #15
File: tools.py Project: nemoware/analyser
  def resave_model_h5(self, model_factory_fn):
    model = self.init_model(model_factory_fn, load_weights=False)
    model.summary()
    model_name = model_factory_fn.__name__
    ch_fn_old = os.path.join(self.model_checkpoint_path, f"{model_name}.weights")
    model.load_weights(ch_fn_old)
    logger.info(f'model weights loaded: {ch_fn_old}')

    ch_fn = os.path.join(self.model_checkpoint_path, f"{model_name}-{keras.__version__}.h5")

    if not os.path.isfile(ch_fn):
      model.save_weights(ch_fn)
      logger.info(f"model weights saved to {ch_fn}")

    else:
      logger.info(f"model weights NOT saved, because file exists {ch_fn}")
Example #16
    def _build_session_and_graph(self):

        embedding_graph = tf.compat.v1.Graph()

        with embedding_graph.as_default():
            logger.info(f'< loading ELMO module {self.module_url}')
            logger.info(
                f'TF hub cache dir is  {os.environ["TFHUB_CACHE_DIR"]}')
            _elmo = hub.Module(self.module_url, trainable=False)
            logger.info(f'ELMO module loaded >')

            self.text_input = tf.compat.v1.placeholder(dtype='string',
                                                       name="text_input")
            self.text_lengths = tf.compat.v1.placeholder(dtype='int32',
                                                         name='text_lengths')

        inputs_elmo = {
            "tokens": self.text_input,
            "sequence_len": self.text_lengths
        }

        inputs_default = {"strings": self.text_input}

        with embedding_graph.as_default():
            logger.info(f'ELMO: creating embedded_out_elmo')
            self.embedded_out_elmo = _elmo(inputs=inputs_elmo,
                                           signature="tokens",
                                           as_dict=True)['elmo']

            logger.info('ELMO: creating embedded_out_defaut')
            self.embedded_out_defaut = _elmo(inputs=inputs_default,
                                             signature="default",
                                             as_dict=True)['default']

        with embedding_graph.as_default():
            self.session = tf.compat.v1.Session(graph=embedding_graph)
            init_op = tf.group([
                tf.compat.v1.global_variables_initializer(),
                tf.compat.v1.tables_initializer()
            ])
            self.session.run(init_op)

        embedding_graph.finalize()
        logger.info(f'graph finalized >>')
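
The placeholders and the 'tokens' signature output built above are presumably what embedd_tokens (used in Example #6) feeds at run time. A hypothetical sketch of that feed for a single sequence; the real method name, batching and padding may differ:

    def embedd_tokens_sketch(self, tokens):
        # hypothetical: run the ELMo 'tokens' signature on one token sequence;
        # for the standard ELMo hub module the result has shape (len(tokens), 1024)
        feed = {self.text_input: [list(tokens)], self.text_lengths: [len(tokens)]}
        return self.session.run(self.embedded_out_elmo, feed_dict=feed)[0]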