def test_slice_start_from_space(self):
    offset = 20
    txt = ' ' * offset + '''основании Устава, с одной стороны, и Фонд «Благо»'''
    tm = TextMap(txt)

    print(tm.map[0])
    print(tm.tokens[11])
    print(tm.map[11])
    print(f'[{tm.text}]')
    print(len(tm))

    tm_sliced = tm.slice(slice(0, len(tm)))
    print('span-0')
    print(tm.map[0])
    print(tm_sliced.map[0])

    self.assertEqual(len(tm), len(tm_sliced))
    self.assertEqual(tm.map[0], tm_sliced.map[0])

    # every char of the first token must resolve to token 0 in both maps
    for c in range(len(tm.tokens[0])):
        print(c)
        self.assertEqual(0, tm.token_index_by_char(c))
        self.assertEqual(0, tm_sliced.token_index_by_char(c))

    self.assertEqual(tm.text, tm_sliced.text)
    self.assertEqual(0, tm.token_index_by_char(0))
def test_map_text_range(self):
    text = """1.2. мама молилась ಶ್ರೀರಾಮ\n\nРама -- Вишну, А Вишну ел... черешню? (черешня по 10 руб. 20 коп.) '' """
    tm = TextMap(text)
    t = tm.text_range([0, 3])
    self.assertEqual(t, '1.2. мама молилась')
def test_map_tokens_in_range(self):
    text = '1.2. мама ಶ್ರೀರಾಮ'
    tm = TextMap(text)

    tokens = tm.tokens_by_range([0, 2])
    self.assertEqual(len(tokens), 2)
    self.assertEqual(tokens[0], '1.2.')
    self.assertEqual(tokens[1], 'мама')
def test_split_span_add_delimiters(self):
    text = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    tm = TextMap(text)

    spans = [s for s in tm.split_spans('\n', add_delimiter=True)]
    for k in spans:
        print(tm.text_range(k))

    self.assertEqual('1 2 3\n', tm.text_range(spans[0]))
def test_concat_TextMap3(self):
    tm1 = TextMap('text prefix \n')
    tm2 = TextMap('more words')

    N = 10
    expected_tokens = len(tm1.tokens) + N * len(tm2.tokens)
    for _ in range(N):
        tm1 += tm2

    self.assertEqual(expected_tokens, len(tm1))
def test_find_value_sign_c(self):
    for (sign_expected, price, currency, _, text) in data:
        tm = TextMap(text)
        sign, span = find_value_sign(tm)
        if sign_expected:
            self.assertEqual(sign_expected, sign, text)

        quote = ''
        if span:
            quote = tm.text_range(span)
        print(f'{sign},\t {span},\t {quote}')
def test_char_range(self):
    text = 'этилен мама ಶ್ರೀರಾಮ'
    tm = TextMap(text)

    cr = tm.char_range([0, 1])
    self.assertEqual('этилен', text[cr[0]:cr[1]])

    cr = tm.char_range([2, None])
    self.assertEqual('ಶ್ರೀರಾಮ', text[cr[0]:cr[1]])

    cr = tm.char_range([None, 1])
    self.assertEqual('этилен', text[cr[0]:cr[1]])
def embedd_contextualized_patterns(self, patterns, trim_padding=True):
    tokenized_sentences_list: [Tokens] = []
    regions = []

    maxlen = 0
    lens = []
    for (ctx_prefix, pattern, ctx_postfix) in patterns:
        # tokenize prefix, pattern and suffix separately so that the pattern's
        # token region within the full sentence is known
        prefix_tokens = TextMap(ctx_prefix).tokens
        pattern_tokens = TextMap(pattern).tokens
        suffix_tokens = TextMap(ctx_postfix).tokens

        start = len(prefix_tokens)
        end = start + len(pattern_tokens)
        sentence_tokens = prefix_tokens + pattern_tokens + suffix_tokens

        regions.append((start, end))
        tokenized_sentences_list.append(sentence_tokens)
        lens.append(len(sentence_tokens))

        if len(sentence_tokens) > maxlen:
            maxlen = len(sentence_tokens)

    # pad every tokenized sentence to the same length
    _strings = []
    for s in tokenized_sentences_list:
        s.extend([' '] * (maxlen - len(s)))
        _strings.append(s)
    _strings = np.array(_strings)

    # ======== call TENSORFLOW ==========================
    sentences_emb = self.embedd_tokenized_text(_strings, lens)
    # ====================================================

    if trim_padding:
        # keep only the embeddings of the pattern tokens, dropping context and padding
        patterns_emb = []
        for i, (start, end) in enumerate(regions):
            sentence_emb = sentences_emb[i]
            pattern_emb = sentence_emb[start:end]
            patterns_emb.append(pattern_emb)
        patterns_emb = np.array(patterns_emb)
    else:
        patterns_emb = sentences_emb

    return patterns_emb, regions
def test_finditer(self):
    from analyser.transaction_values import _re_greather_then

    text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(text)

    iter = tm.finditer(_re_greather_then)
    results = [t for t in iter]
    results = results[0]

    self.assertEqual('превышающую', tm.text_range(results))
    self.assertEqual(4, results[0])
    self.assertEqual(5, results[1])
def test_concat_TextMap(self):
    tm1 = TextMap('a')
    tm2 = TextMap('b')

    tm1 += tm2

    self.assertEqual('ab', tm1.text)
    self.assertEqual('a', tm1.tokens[0])
    self.assertEqual('b', tm1.tokens[1])
    self.assertEqual(2, len(tm1))
    self.assertEqual(1, len(tm2))
def test_token_indices_by_char_range(self):
    text = 'мама'
    span = [0, 4]
    expected = text[span[0]:span[1]]
    print(expected)

    tm = TextMap(text)  # tokenization
    ti = tm.token_indices_by_char_range(span)
    self.assertEqual(0, ti[0])
    self.assertEqual(1, ti[1])

    self.assertEqual(expected, tm.text_range(ti))
def test_concat_TextMap2(self):
    tm1 = TextMap('alpha \n')
    tm2 = TextMap('bet')

    tm1 += tm2
    self.assertEqual(3, len(tm1))
    self.assertEqual(1, len(tm2))

    self.assertEqual('alpha \nbet', tm1.text)
    self.assertEqual('alpha', tm1.tokens[0])
    self.assertEqual('bet', tm1.tokens[2])
def find_most_relevant_paragraphs(section: TextMap,
                                  attention_vector: FixedVector,
                                  min_len: int = 20,
                                  return_delimiters=True,
                                  threshold=0.45):
    _blur = int(HyperParameters.subject_paragraph_attention_blur)
    _padding = int(_blur * 2 + 1)

    # smooth the attention vector; pad first so the blur does not leak over the edges,
    # then trim the padding back off
    paragraph_attention_vector = smooth_safe(
        np.pad(attention_vector, _padding, mode='constant'), _blur)[_padding:-_padding]
    paragraph_attention_vector = relu(paragraph_attention_vector, threshold)

    top_indices = [i for i, v in enumerate(paragraph_attention_vector) if v > 0.00001]

    spans = []
    for i in top_indices:
        span = section.sentence_at_index(i, return_delimiters)
        # skip sentences shorter than min_len; collect each qualifying span only once
        if min_len is not None and span[1] - span[0] < min_len:
            continue
        if span not in spans:
            spans.append(span)

    return spans, paragraph_attention_vector
def test_tokens_in_range_start_from_space(self):
    text = ' мама'
    tm = TextMap(text)
    self.assertEqual(1, tm.map[0][0])
    self.assertEqual(0, tm.token_index_by_char(0))

    txt = ' ' * 20 + '''основании Устава, с одной стороны, и Фонд «Благо»'''
    # tm = TextMap(txt)
    doc = LegalDocument(txt).parse()
    tm = doc.tokens_map

    print(tm.map[0])
    print(tm.tokens[11])
    print(tm.map[11])
    print(f'[{doc.tokens_map.text}]')
    print(f'[{doc.text}]')
def test_token_indices_by_char_range_sliced(self):
    text = 'm йe qwert'
    __tm = TextMap(text)  # tokenization
    tm = __tm.slice(slice(1, 2))

    self.assertEqual('йe', tm.tokens[0])
    self.assertEqual('йe', tm.text)

    char_range = tm.char_range([0, 1])
    ti = tm.token_indices_by_char_range(char_range)
    self.assertEqual(0, ti[0])
    self.assertEqual(1, ti[1])

    ti = tm.token_indices_by_char_range([1, 2])
    self.assertEqual(0, ti[0])
def test_slice(self):
    text = 'этилен мама ಶ್ರೀರಾಮ'
    tm = TextMap(text)

    tm2: TextMap = tm.slice(slice(1, 2))
    self.assertEqual(tm2[0], 'мама')
    self.assertEqual(tm2.text, 'мама')

    tm3 = tm2.slice(slice(0, 1))
    self.assertEqual(tm3[0], 'мама')

    self.assertEqual(0, tm.token_index_by_char(1))
    self.assertEqual(0, tm2.token_index_by_char(1))
    self.assertEqual(0, tm3.token_index_by_char(1))

    self.assertEqual('мама', tm3.text)
    self.assertEqual('мама', tm3.text_range([0, 1]))
    self.assertEqual('мама', tm3.text_range([0, 2]))
def test_sentence_at_index_return_delimiters(self):
    tm = TextMap('стороны Заключили\n договор ПРЕДМЕТ \nДОГОВОРА')

    for i in range(len(tm)):
        print(i, tm[i])

    bounds = tm.sentence_at_index(0)
    print(bounds)
    print(tm.text_range(bounds))

    for i in range(0, 3):
        bounds = tm.sentence_at_index(i)
        self.assertEqual('стороны Заключили\n', tm.text_range(bounds), str(i))

    for i in range(3, 5):
        bounds = tm.sentence_at_index(i)
        self.assertEqual('договор ПРЕДМЕТ \n', tm.text_range(bounds))

    for i in range(6, 7):
        bounds = tm.sentence_at_index(i)
        self.assertEqual('ДОГОВОРА', tm.text_range(bounds))
def test_normalize_basics(self):
    cn = CaseNormalizer()
    tm = TextMap('стороны Заключили (ХОРОШИЙ)договор, (уррраа!!) ПРЕДМЕТ ДОГОВОРА')
    tm2 = cn.normalize_tokens_map_case(tm)

    self.assertEqual(tm.map, tm2.map)
    self.assertEqual(tm2[1], 'заключили')
    self.assertEqual(tm2[12], 'Предмет')

    for i in range(len(tm)):
        self.assertEqual(tm2[i].lower(), tm[i].lower())
def test_tokens_in_range(self):
    text = 'мама'
    tm = TextMap(text)
    self.assertEqual(0, tm.token_index_by_char(0))
    self.assertEqual(0, tm.token_index_by_char(1))
    self.assertEqual(0, tm.token_index_by_char(2))
    self.assertEqual(0, tm.token_index_by_char(3))

    text = 'мама выла папу'
    tm = TextMap(text)
    self.assertEqual(1, tm.token_index_by_char(5))
    self.assertEqual(1, tm.token_index_by_char(6))
    self.assertEqual(1, tm.token_index_by_char(7))
    self.assertEqual(1, tm.token_index_by_char(8))
    self.assertEqual(2, tm.token_index_by_char(9))
    self.assertEqual(1, tm.token_index_by_char(4))
def nn_get_tag_value(tagname: str, textmap: TextMap, semantic_map: DataFrame, threshold=0.3) -> SemanticTag or None:
    att = semantic_map[tagname].values
    slices = find_top_spans(att, threshold=threshold, limit=1)  # TODO: estimate per-tag thresholds

    if len(slices) > 0:
        span = slices[0].start, slices[0].stop
        value = textmap.text_range(span)
        tag = SemanticTag(tagname, value, span)
        tag.confidence = float(att[slices[0]].mean())
        return tag
    return None
def test_get_by_index(self):
    ಶ್ರೀರಾಮ = self
    ಮ = 'ቋንቋ የድምጽ፣ የምልክት ወይም የምስል ቅንብር ሆኖ ለማሰብ'
    ቅ = TextMap(ಮ)
    ಶ್ರೀರಾಮ.assertEqual(ቅ[0], 'ቋንቋ')
    ಶ್ರೀರಾಮ.assertEqual(ቅ[1], 'የድምጽ፣')
    ಶ್ರೀರಾಮ.assertEqual(ቅ[2], 'የምልክት')

    # test iteration
    for x in ቅ:
        print(x)

    # test slicing
    print(ቅ[0:2])
def doc_features(tokens_map: TextMap):
    body_lines_ranges = tokens_map.split_spans(PARAGRAPH_DELIMITER, add_delimiter=True)

    _doc_features = []
    _line_spans = []
    ln = 0
    _prev_features = None
    for line_span in body_lines_ranges:
        _line_spans.append(line_span)

        _features = line_features(tokens_map, line_span, ln, _prev_features)
        _doc_features.append(_features)

        _prev_features = _features
        ln += 1

    doc_featuresX_data = pd.DataFrame.from_records(_doc_features)
    doc_features_data = np.array(doc_featuresX_data)

    return doc_features_data, _line_spans
def test_concat_then_slice(self):
    text1 = 'этилен мама'
    text2 = 'этилен папа'

    tm0 = TextMap('')
    tm1 = TextMap(text1)
    tm2 = TextMap(text2)

    tm0 += tm1
    tm0 += tm2
    print(tm1.tokens)

    self.assertEqual(text1 + text2, tm0.text)
    self.assertEqual('мамаэтилен', tm0.text_range([1, 3]))

    tm3 = tm0.slice(slice(1, 3))
    self.assertEqual('мамаэтилен', tm3.text)
def get_sentence_map(self) -> TextMap:
    if 'sentences' in self.analysis['tokenization_maps']:
        _map = self.analysis['tokenization_maps']['sentences']
        tokens_map = TextMap(self.analysis['normal_text'], _map)
        return tokens_map
def test_find_value_sign_b(self):
    text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(text)

    sign, span = find_value_sign(tm)
    quote = tm.text_range(span)
    self.assertEqual('менее', quote)
def test_PARAGRAPH_DELIMITER(self):
    tm = TextMap('a' + PARAGRAPH_DELIMITER + 'b')
    print(tm.tokens)
    self.assertEqual(3, len(tm))
def test_get_len(self):
    text = 'а б с'
    tm = TextMap(text)
    self.assertEqual(3, len(tm))
def test_get_tokens(self):
    text = 'ಉಂದು ಅರ್ತೊಪೂರ್ಣೊ ವಾಕ್ಯೊಲೆನ್ ಕೊರ್ಪುನ ಸಾಮರ್ತ್ಯೊನು ಹೊಂದೊಂತ್ '
    tm = TextMap(text)
    print(tm.tokens)
    self.assertEqual(6, len(tm.tokens))
def test_tokenize_numbered(self):
    text = '1. этилен мама, этилен!'
    tm = TextMap(text)

    self.assertEqual(tm.tokens[0], '1.')
    self.assertEqual(tm.tokens[1], 'этилен')
def test_split(self):
    text = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    tm = TextMap(text)
    for k in tm.split('\n'):
        print(k)