def test_doc_parser(self):
    db = get_mongodb_connection()
    if db is None:  # TODO: this is a weird way of detecting we're on CI
        return

    FILENAME = "/Users/artem/work/nemo/goil/IN/Другие договоры/Договор Формула.docx"
    wp = WordDocParser()
    res = wp.read_doc(FILENAME)

    doc: LegalDocument = LegalDocument('')
    doc.parse()

    last = 0
    for d in res['documents']:
        for p in d['paragraphs']:
            header_text = p['paragraphHeader']['text'] + '\n'
            body_text = p['paragraphBody']['text'] + '\n'

            header = LegalDocument(header_text)
            header.parse()
            # self.assertEqual(self.n(header_text), header.text)

            doc += header
            headerspan = (last, len(doc.tokens_map))
            print(headerspan)
            last = len(doc.tokens_map)

            body = LegalDocument(body_text)
            body.parse()
            doc += body
            bodyspan = (last, len(doc.tokens_map))

            header_tag = SemanticTag('headline', header_text, headerspan)
            body_tag = SemanticTag('paragraphBody', None, bodyspan)

            print(header_tag)
            # print(body_tag)

            para = Paragraph(header_tag, body_tag)
            doc.paragraphs.append(para)
            last = len(doc.tokens_map)

            h_subdoc = doc.subdoc_slice(para.header.as_slice())
            b_subdoc = doc.subdoc_slice(para.body.as_slice())
            # self.assertEqual(self.n(header_text), h_subdoc.text)
            # self.assertEqual(self.n(body_text), b_subdoc.text)

    print('-' * 100)
    print(doc.text)

    headers = [doc.subdoc_slice(p.header.as_slice()) for p in doc.paragraphs]
    print('-' * 100)
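# A minimal sketch of the span bookkeeping the test above relies on, assuming
# that appending a sub-document to a LegalDocument grows its tokens map, so
# each appended piece occupies the token range (old length, new length). The
# toy "document" below is just a list of tokens, purely for illustration.
doc_tokens = []
spans = {}

for name, piece in [('header', ['1.', 'Subject', 'of', 'contract']),
                    ('body', ['The', 'supplier', 'shall', '...'])]:
    last = len(doc_tokens)          # start of the appended piece
    doc_tokens += piece             # "doc += header" / "doc += body" analog
    spans[name] = (last, len(doc_tokens))

# spans == {'header': (0, 4), 'body': (4, 8)}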
def nn_get_subject(textmap: TextMap, semantic_map: DataFrame, subj_1hot) -> SemanticTag:
    predicted_subj_name, confidence, _ = decode_subj_prediction(subj_1hot)

    tag = SemanticTag('subject', predicted_subj_name.name, span=None)
    tag.confidence = confidence

    tag_ = nn_get_tag_value('subject', textmap, semantic_map)
    if tag_ is not None:
        tag.span = tag_.span

    return tag
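# A minimal sketch of what decode_subj_prediction plausibly does: take the
# argmax over the class-probability vector and use the winning probability as
# confidence. The enum below is invented for illustration; the project has
# its own subject taxonomy.
from enum import Enum
import numpy as np

class SubjectSketch(Enum):
    Deal = 0
    Charity = 1
    Loan = 2

def decode_subj_prediction_sketch(probs: np.ndarray):
    idx = int(np.argmax(probs))                       # winning class index
    return SubjectSketch(idx), float(probs[idx]), idx

# decode_subj_prediction_sketch(np.array([0.1, 0.7, 0.2]))
# -> (SubjectSketch.Charity, 0.7, 1)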
def nn_get_tag_value(tagname: str, textmap: TextMap, semantic_map: DataFrame,
                     threshold=0.3) -> SemanticTag or None:
    att = semantic_map[tagname].values
    slices = find_top_spans(att, threshold=threshold, limit=1)  # TODO: estimate per-tag thresholds

    if len(slices) > 0:
        span = slices[0].start, slices[0].stop
        value = textmap.text_range(span)
        tag = SemanticTag(tagname, value, span)
        tag.confidence = float(att[slices[0]].mean())
        return tag

    return None
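# A minimal sketch of thresholded span extraction, assuming find_top_spans
# works roughly like this: keep contiguous runs of tokens whose attention
# exceeds the threshold, rank the runs by mean attention, and return the top
# `limit` runs as slices. An illustration, not the project's implementation.
import numpy as np

def find_top_spans_sketch(att: np.ndarray, threshold=0.3, limit=1) -> list:
    mask = att > threshold
    runs, start = [], None
    for i, m in enumerate(mask):
        if m and start is None:
            start = i                      # a run of above-threshold tokens begins
        elif not m and start is not None:
            runs.append(slice(start, i))   # the run ends just before token i
            start = None
    if start is not None:
        runs.append(slice(start, len(att)))
    runs.sort(key=lambda s: att[s].mean(), reverse=True)
    return runs[:limit]

# find_top_spans_sketch(np.array([0.1, 0.9, 0.8, 0.2, 0.5])) -> [slice(1, 3)]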
def find_document_date(doc: LegalDocument, tagname='date') -> SemanticTag or None:
    head: LegalDocument = get_doc_head(doc)

    c_span, _date = find_date(head.text)
    if c_span is None:
        return None

    span = head.tokens_map.token_indices_by_char_range(c_span)
    return SemanticTag(tagname, _date, span)
def find_document_number_in_subdoc(doc: LegalDocument, tagname='number', parent=None) -> [SemanticTag]:
    ret = []
    # re.finditer returns an iterator, which is always truthy, so iterating
    # directly replaces the original (dead) emptiness check
    for finding in re.finditer(document_number_c, doc.text):
        _number = finding['number']
        if is_number_valid(_number):
            span = doc.tokens_map.token_indices_by_char_range(finding.span())
            tag = SemanticTag(tagname, _number, span, parent=parent)
            tag.offset(doc.start)
            ret.append(tag)
        else:
            print('invalid', _number)

    return ret
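# A minimal sketch of mapping a character span to a token span, assuming the
# tokens map keeps a (start, stop) character offset per token, which is
# roughly what TextMap.token_indices_by_char_range provides. Names here are
# illustrative, not the project's API.
from bisect import bisect_right

def char_range_to_token_range(token_char_spans, char_span):
    starts = [s for s, _ in token_char_spans]
    stops = [e for _, e in token_char_spans]
    first = bisect_right(stops, char_span[0])      # first token ending after the span start
    last = bisect_right(starts, char_span[1] - 1)  # exclusive end: tokens starting before the span end
    return first, last

# tokens at chars [(0, 3), (4, 9), (10, 12)]:
# char_range_to_token_range([(0, 3), (4, 9), (10, 12)], (4, 12)) -> (1, 3)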
def test_contract_analyze(self):
    doc, factory, ctx = self._get_doc_factory_ctx()

    doc.__dict__['number'] = None  # hack for old pickles
    doc.__dict__['date'] = None  # hack for old pickles
    doc.__dict__['attributes_tree'] = ContractSchema()  # hack for old pickles

    ctx.find_attributes(doc, AuditContext())
    tags: [SemanticTag] = doc.get_tags()

    _tag = SemanticTag.find_by_kind(tags, ContractTags.Value.display_string)
    quote = doc.tokens_map.text_range(_tag.span)
    self.assertEqual('80000,00', quote)

    _tag = SemanticTag.find_by_kind(tags, ContractTags.Currency.display_string)
    quote = doc.tokens_map.text_range(_tag.span)
    self.assertEqual('рублей', quote)
def find_document_number(doc: LegalDocument, tagname='number') -> SemanticTag or None:
    head: LegalDocument = get_doc_head(doc)

    _number, finding_span = find_document_number_span(head.text)
    if _number is not None:
        span = head.tokens_map.token_indices_by_char_range(finding_span)
        return SemanticTag(tagname, _number, span)

    return None
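# A minimal sketch of the named-group pattern these number finders rely on:
# the compiled regex is expected to expose a 'number' group (see
# finding['number'] in find_document_number_in_subdoc). The pattern below is
# a made-up simplification, not the project's actual document_number_c.
import re

document_number_sketch = re.compile(r'(договор|контракт)\s*№\s*(?P<number>[\w./-]+)',
                                    re.IGNORECASE)

m = document_number_sketch.search('Договор № 123/45-А от 01.02.2020')
# m['number'] -> '123/45-А'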
def find_charter_org(charter: LegalDocument) -> [SemanticTag]:
    """
    TODO: see also find_protocol_org
    :param charter:
    :return:
    """
    ret = []
    x: [SemanticTag] = find_org_names(charter[0:HyperParameters.protocol_caption_max_size_words],
                                      max_names=1)

    nm = SemanticTag.find_by_kind(x, 'org-1-name')
    if nm is not None:
        ret.append(nm)
    else:
        charter.warn(ParserWarnings.org_name_not_found)

    tp = SemanticTag.find_by_kind(x, 'org-1-type')
    if tp is not None:
        ret.append(tp)
    else:
        charter.warn(ParserWarnings.org_type_not_found)

    return ret
def asLegalDoc(self):
    if self.is_analyzed():
        # attributes are bound to an existing tokens map
        # --> preserve saved tokenization
        doc = create_doc_by_type(self.parse['documentType'], self._id, filename=self.filename)
        doc.tokens_map_norm = self.get_tokens_for_embedding()
        doc.tokens_map = self.get_tokens_map_unchaged()

        if 'sentence_map' in doc.__dict__:
            doc.sentence_map = self.get_sentence_map()
            if doc.sentence_map is None:
                doc.split_into_sentenses()

        headers = self.analysis.get('headers', None)
        if headers is not None:
            doc.paragraphs = []
            last = len(doc.tokens_map)

            for i, h in enumerate(headers):
                header_tag = SemanticTag('headline', h['value'], h['span'])
                body_end = last
                if i < len(headers) - 1:
                    body_end = headers[i + 1]['span'][0]
                bodyspan = header_tag.span[1] + 1, body_end
                body_tag = SemanticTag('paragraphBody', None, bodyspan)

                para = Paragraph(header_tag, body_tag)
                doc.paragraphs.append(para)
    else:
        # re-combine parser data
        doc = join_paragraphs(self.parse, self._id, filename=self.filename)

    doc.user = self.user
    return doc
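# A minimal sketch of how body spans are derived from consecutive header
# spans in the analyzed branch above: each paragraph body runs from just past
# the end of its header to the start of the next header, or to the end of the
# document for the last one. The spans below are made up for illustration.
def body_spans_from_headers(header_spans, doc_len):
    spans = []
    for i, (h_start, h_end) in enumerate(header_spans):
        body_end = header_spans[i + 1][0] if i < len(header_spans) - 1 else doc_len
        spans.append((h_end + 1, body_end))
    return spans

# body_spans_from_headers([(0, 3), (20, 24)], 50) -> [(4, 20), (25, 50)]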
def find_org_names_raw_by_re(doc: LegalDocument, regex, confidence_base: float, parent=None,
                             decay_confidence=True) -> [ContractAgent]:
    all_: [ContractAgent] = []

    for m in re.finditer(regex, doc.text):
        ca = ContractAgent()
        all_.append(ca)

        for re_kind in org_pieces:  # like 'type', 'name', 'human_name', 'alt_name', 'alias' ...
            try:
                char_span = m.span(re_kind)
                if span_len(char_span) > 1:
                    span = doc.tokens_map.token_indices_by_char_range(char_span)
                    confidence = confidence_base
                    if decay_confidence:
                        confidence *= (1.0 - (span[0] / len(doc)))

                    kind = re_kind
                    if re_kind == 'human_name':
                        kind = 'name'

                    val = doc.tokens_map.text_range(span)
                    val = val.strip()
                    if _is_valid(val):
                        tag = SemanticTag(kind, val, span, parent=parent)
                        tag.confidence = confidence
                        tag.offset(doc.start)
                        ca.__dict__[kind] = tag
            except IndexError:
                pass  # the regex has no group of this kind

    # normalize org names via find_closest_org_name
    for ca in all_:
        normalize_contract_agent(ca)

    return all_
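# A minimal illustration of the confidence decay used above: matches found
# closer to the start of the document keep more of the base confidence, which
# suits contracts, where org names normally appear in the preamble. Numbers
# are made up for the example.
def decayed_confidence(confidence_base, token_index, doc_len_tokens):
    return confidence_base * (1.0 - token_index / doc_len_tokens)

# decayed_confidence(0.8, 0, 1000)   -> 0.8  (match at the very top)
# decayed_confidence(0.8, 500, 1000) -> 0.4  (match in the middle)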
def as_tag(self):
    st = SemanticTag(self.type, None, (self.subdoc.start, self.body.end))
    st.confidence = self.confidence
    return st
def tag_val(name):
    tag = SemanticTag.find_by_kind(tags, name)
    if tag is not None:
        return tag.value