Beispiel #1
0
def audit_phase_2(audit, kind=None):
    """Run the analysis phase over the documents attached to ``audit``.

    Collects the ids of the audit's documents that are in the
    ``Preprocessed`` or ``Error`` state (optionally restricted to ``kind``),
    appends the ids listed under ``audit["charters"]`` and, for every
    document that ``need_analysis`` accepts and that is preprocessed, calls
    the matching processor's ``process``.  Finally the audit status is
    switched to ``"Finalizing"`` unconditionally.

    :param audit: audit record; ``audit["_id"]`` and
        ``audit["subsidiary"]["name"]`` are read, ``"charters"`` is optional.
    :param kind: optional document kind forwarded to
        ``get_docs_by_audit_id``.
    """
    ctx = AuditContext(audit["subsidiary"]["name"])

    # Reported through the logger (not print) so this banner ends up in the
    # same place as every other diagnostic emitted by this function.
    logger.info(f'.....processing audit {audit["_id"]}')

    document_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=kind,
        id_only=True)

    # Charters referenced by the audit are handled together with its own docs.
    document_ids.extend(audit.get("charters", []))

    for k, document_id in enumerate(document_ids):
        jdoc = DbJsonDoc(finalizer.get_doc_by_id(document_id))

        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}'
            )
            continue

        if need_analysis(jdoc) and jdoc.isPreprocessed():
            logger.info(
                f'.....processing  {k} of {len(document_ids)}   {jdoc.documentType} {document_id}'
            )
            processor.process(jdoc, audit, ctx)

    # TODO: check ALL docs in proper state
    change_audit_status(audit, "Finalizing")
Beispiel #2
0
def audit_phase_1(audit, kind=None):
    """Pre-process the new documents attached to ``audit``.

    Fetches the ids of the audit's documents in the ``New`` state
    (optionally restricted to ``kind``), appends the ids listed under
    ``audit["charters"]`` and calls the matching processor's ``preprocess``
    for each document that ``need_analysis`` accepts and that is new.

    :param audit: audit record; ``audit["_id"]`` and
        ``audit["subsidiary"]["name"]`` are read, ``"charters"`` is optional.
    :param kind: optional document kind forwarded to
        ``get_docs_by_audit_id``.
    """
    logger.info(f'.....processing audit {audit["_id"]}')
    audit_context = AuditContext(audit["subsidiary"]["name"])

    ids_to_handle = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.New.value],
        kind=kind,
        id_only=True,
    )
    ids_to_handle.extend(audit.get("charters", []))

    for position, doc_id in enumerate(ids_to_handle):
        json_doc = DbJsonDoc(finalizer.get_doc_by_id(doc_id))

        handler: BaseProcessor = document_processors.get(json_doc.documentType)
        if handler is None:
            logger.warning(
                f'unknown/unsupported doc type: {json_doc.documentType}, cannot process {doc_id}'
            )
            continue

        # The progress line is written for every supported document, even
        # when the document is skipped by the check below.
        logger.info(
            f'......pre-processing {position} of {len(ids_to_handle)}  {json_doc.documentType}:{doc_id}'
        )
        if need_analysis(json_doc) and json_doc.isNew():
            handler.preprocess(jdoc=json_doc, context=audit_context)
Beispiel #3
0
    def test_from_json(self):
        """Serialize a DocumentJson to a JSON string, restore it, compare keys."""
        document, _, parser = self._get_doc_factory_ctx()

        # hack for old pickles: set the fields directly on the instance dict
        legacy_fields = vars(document)
        legacy_fields['number'] = None
        legacy_fields['date'] = None
        legacy_fields['warnings'] = []
        legacy_fields['attributes_tree'] = ContractSchema()

        parser.find_attributes(document, AuditContext())

        original = DocumentJson(document)
        serialized = json.dumps(
            vars(original),
            indent=4,
            ensure_ascii=False,
            default=json_util.default,
        )

        restored: DocumentJson = DocumentJson.from_json_str(serialized)

        # every top-level field of the restored object must exist in the source
        for field_name in vars(restored):
            print(field_name)
            self.assertIn(field_name, vars(original).keys())

        # attribute keys must match in both directions
        for attr_name in restored.attributes:
            self.assertIn(attr_name, original.attributes.keys())

        for attr_name in original.attributes:
            self.assertIn(attr_name, restored.attributes.keys())
Beispiel #4
0
def nn_find_org_names(textmap: TextMap, semantic_map: DataFrame,
                      audit_ctx: AuditContext) -> [ContractAgent]:
    """Build the two contract agents from the ``org-<n>-<field>`` tags.

    For organisations 1 and 2 the ``name``, ``alias`` and ``type`` tags are
    read with ``nn_get_tag_value`` and stored on a fresh ``ContractAgent``,
    which is then normalized.  The agents are ordered so that, when
    ``audit_ctx.audit_subsidiary_name`` is set, the agent matching the audit
    organisation comes first; otherwise they are ordered by name value (an
    agent without a name sorts as ``''``).  ``check_org_intersections`` is
    applied to the ordered list before it is returned.
    """

    def _sortable_name(agent):
        # agents without a name tag sort as the empty string
        name_tag = agent.name
        return '' if name_tag is None else name_tag.value

    def _build_agent(org_index):
        agent: ContractAgent = ContractAgent()
        for field in ('name', 'alias', 'type'):
            value = nn_get_tag_value(f'org-{org_index}-{field}', textmap,
                                     semantic_map)
            setattr(agent, field, value)
        normalize_contract_agent(agent)
        return agent

    agents = [_build_agent(org_index) for org_index in (1, 2)]

    if audit_ctx.audit_subsidiary_name:
        # known subsidiary goes first (False sorts before True)
        agents.sort(
            key=lambda agent: not audit_ctx.is_same_org(_sortable_name(agent)))
    else:
        agents.sort(key=_sortable_name)

    check_org_intersections(agents)  # mutator

    return agents
Beispiel #5
0
    def test_charter_parser(self):
        """Smoke-run CharterParser over a tiny charter with a fake embedder."""
        embedder = FakeEmbedder([1, 6, 4])
        parsed_text = LegalDocument("1. ЮРИДИЧЕСКИЙ содержание 4.").parse()
        charter = CharterDocument(parsed_text).parse()

        # the same fake embedder is passed for both constructor arguments
        parser = CharterParser(embedder, embedder)
        audit_context = AuditContext()
        parser.find_org_date_number(charter, audit_context)
        parser.find_attributes(charter, audit_context)

        print(charter.warnings)
Beispiel #6
0
    def test_analyze_contract_0(self):
        """Smoke-run ContractParser over a tiny contract with a fake embedder."""
        parser = ContractParser(FakeEmbedder(np.random.rand(1024)))

        contract = ContractDocument("1. ЮРИДИЧЕСКИЙ содержание 4.")
        contract.parse()

        audit_context = AuditContext()
        parser.find_org_date_number(contract, audit_context)
        parser.find_attributes(contract, audit_context)

        parser._logstep("analyze_contract")
Beispiel #7
0
    def test_process_charters_phase_1(self):
        """Pre-process every CHARTER document of the first available audit."""
        available_audits = get_audits()
        if len(available_audits) == 0:
            # nothing to run against; the test silently passes
            logger.warning('no audits')
            return

        first_audit_id = available_audits[0]['_id']
        charter_docs: [dict] = get_docs_by_audit_id(first_audit_id,
                                                    kind='CHARTER')
        charter_processor = document_processors.get('CHARTER')
        for raw_doc in charter_docs:
            json_doc = DbJsonDoc(raw_doc)
            charter_processor.preprocess(json_doc, AuditContext())
Beispiel #8
0
def audit_charters_phase_1():
    """Pre-process every charter returned by ``get_all_new_charters``.

    Each charter is wrapped in a ``DbJsonDoc`` and handed to the CHARTER
    processor's ``preprocess`` with its own fresh ``AuditContext``.
    """
    new_charters = get_all_new_charters()
    charter_processor: BaseProcessor = document_processors[CHARTER]

    for index, raw_charter in enumerate(new_charters):
        json_doc = DbJsonDoc(raw_charter)
        logger.info(
            f'......pre-processing {index} of {len(new_charters)} CHARTER {json_doc.get_id()}'
        )
        charter_processor.preprocess(json_doc, context=AuditContext())
Beispiel #9
0
    def test_analyze_contract(self):
        """Pre-process and then process one fixed contract from the DB."""
        contract_processor: BaseProcessor = document_processors[CONTRACT]

        db_doc = get_doc_by_id(ObjectId('5ded004e4ddc27bcf92dd47c'))
        if db_doc is None:
            # the hard-coded fixture id is not present in this database
            raise RuntimeError("fix unit test please")

        parent_audit = get_audit_by_id(db_doc['auditId'])
        json_doc = DbJsonDoc(db_doc)
        logger.info(f'......pre-processing {json_doc._id}')

        audit_context = AuditContext()
        contract_processor.preprocess(json_doc, context=audit_context)
        contract_processor.process(json_doc, parent_audit, audit_context)
Beispiel #10
0
    def test_analyze_charter(self):
        """Pre-process and then process one fixed charter from the DB."""
        charter_processor: BaseProcessor = document_processors[CHARTER]

        db_doc = get_doc_by_id(ObjectId('5e5de70d01c6c73c19eebd48'))
        if db_doc is None:
            # the hard-coded fixture id is not present in this database
            raise RuntimeError("fix unit test please")

        parent_audit = get_audit_by_id(db_doc['auditId'])
        json_doc = DbJsonDoc(db_doc)
        logger.info(f'......pre-processing {json_doc._id}')

        audit_context = AuditContext()
        charter_processor.preprocess(json_doc, context=audit_context)
        charter_processor.process(json_doc, parent_audit, audit_context)
Beispiel #11
0
    def test_to_json(self):
        """Find attributes on the fixture document and print its JSON dump."""
        document, _factory, parser = self._get_doc_factory_ctx()

        # hack for old pickles: set the fields directly on the instance dict
        legacy_fields = document.__dict__
        legacy_fields['number'] = None
        legacy_fields['date'] = None
        legacy_fields['warnings'] = []
        legacy_fields['attributes_tree'] = ContractSchema()

        parser.find_attributes(document, AuditContext())

        dumped = DocumentJson(document).dumps()
        print(dumped)
Beispiel #12
0
    def test_process_contracts_phase_1(self):
        """Pre-process every CONTRACT document of the first available audit."""
        available_audits = get_audits()
        if len(available_audits) == 0:
            # nothing to run against; the test silently passes
            logger.warning('no audits')
            return

        first_audit_id = available_audits[0]['_id']
        contract_docs = get_docs_by_audit_id(first_audit_id, kind='CONTRACT')
        contract_processor = document_processors.get('CONTRACT')

        for raw_doc in contract_docs:
            json_doc = DbJsonDoc(raw_doc)
            contract_processor.preprocess(json_doc, AuditContext())
Beispiel #13
0
def audit_charters_phase_2():  # XXX: #TODO: DO NOT LOAD ALL CHARTERS AT ONCE
    """Process every charter in the ``Preprocessed`` or ``Error`` state.

    The charters are fetched with ``get_docs_by_audit_id(id=None, ...)`` and
    each one is passed to the CHARTER processor's ``process`` with
    ``audit=None`` and its own fresh ``AuditContext``.
    """
    wanted_states = [
        DocumentState.Preprocessed.value,
        DocumentState.Error.value,
    ]
    charters = get_docs_by_audit_id(id=None,
                                    states=wanted_states,
                                    kind=CHARTER)

    for index, raw_charter in enumerate(charters):
        json_doc = DbJsonDoc(raw_charter)
        charter_processor: BaseProcessor = document_processors[CHARTER]

        logger.info(
            f'......processing  {index} of {len(charters)}  CHARTER {json_doc.get_id()}'
        )
        charter_processor.process(json_doc,
                                  audit=None,
                                  context=AuditContext())
Beispiel #14
0
    def test_process_protocols_phase_1(self):
        """Run org/date/number detection on every PROTOCOL of every audit."""
        runner = get_runner_instance_no_embedder()

        for audit in get_audits():
            protocol_docs = get_docs_by_audit_id(audit['_id'],
                                                 kind='PROTOCOL')

            for raw_doc in protocol_docs:
                json_doc = DbJsonDoc(raw_doc)
                protocol = json_doc.asLegalDoc()

                runner.protocol_parser.find_org_date_number(
                    protocol, AuditContext())
                # results are written back with -1 as the third argument
                save_analysis(json_doc, protocol, -1)
Beispiel #15
0
    def test_contract_analyze(self):
        """Check the value and currency quotes found in the fixture contract."""
        document, _factory, parser = self._get_doc_factory_ctx()

        # hack for old pickles: set the fields directly on the instance dict
        legacy_fields = document.__dict__
        legacy_fields['number'] = None
        legacy_fields['date'] = None
        legacy_fields['attributes_tree'] = ContractSchema()

        parser.find_attributes(document, AuditContext())
        tags: [SemanticTag] = document.get_tags()

        def quoted_text(tag_kind):
            # text covered by the span of the tag of the given kind
            found = SemanticTag.find_by_kind(tags, tag_kind)
            return document.tokens_map.text_range(found.span)

        self.assertEqual('80000,00',
                         quoted_text(ContractTags.Value.display_string))
        self.assertEqual('рублей',
                         quoted_text(ContractTags.Currency.display_string))
Beispiel #16
0
    def test_analyse_acontract(self):
        """Pre-process and process one fixed contract using its audit's subsidiary.

        The document and its audit are loaded from the DB; the
        ``AuditContext`` is built from ``audit["subsidiary"]["name"]``.

        :raises RuntimeError: when the hard-coded fixture document is not
            present in the database.
        """
        doc = get_doc_by_id(ObjectId('5fdb213f542ce403c92b4530'))
        if doc is None:
            # Fail with an explicit message instead of a TypeError on
            # doc['auditId'] when the fixture is missing.
            raise RuntimeError("fix unit test please")

        audit = get_audit_by_id(doc['auditId'])
        jdoc = DbJsonDoc(doc)
        logger.info(f'......pre-processing {jdoc._id}')
        _audit_subsidiary: str = audit["subsidiary"]["name"]

        ctx = AuditContext(_audit_subsidiary)
        processor: BaseProcessor = document_processors[CONTRACT]
        processor.preprocess(jdoc, context=ctx)
        processor.process(jdoc, audit, ctx)
        print(jdoc)
Beispiel #17
0
    def test_get_org_names(self):
        """Print the tags found in every CHARTER of the first available audit."""
        charter_parser = CharterParser()

        available_audits = get_audits()
        if len(available_audits) == 0:
            # nothing to run against; the test silently passes
            logger.warning('no audits')
            return

        charter_docs = get_docs_by_audit_id(available_audits[0]['_id'],
                                            kind=CHARTER)

        for db_document in charter_docs:
            print(db_document['filename'])

            charter: CharterDocument = join_paragraphs(
                db_document['parse'], doc_id=db_document['_id'])

            # TODO: mind, this could be slow if embedding is required
            charter_parser.find_org_date_number(charter, AuditContext())

            for found_tag in charter.get_tags():
                print(found_tag)
Beispiel #18
0
 def _preprocess_single_doc(self, kind):
     """Pre-process every document of ``kind`` yielded by ``_get_doc_from_db``.

     Each document gets its own fresh ``AuditContext``.
     """
     for raw_doc in self._get_doc_from_db(kind):
         json_doc = DbJsonDoc(raw_doc)
         # the processor is looked up per document, keyed by the same kind
         handler = document_processors.get(kind)
         handler.preprocess(json_doc, AuditContext())