Example #1
  def test_slice_start_from_space(self):
    offset = 20
    txt = ' ' * offset + '''основании Устава, с одной стороны, и Фонд «Благо»'''
    tm = TextMap(txt)
    print(tm.map[0])
    print(tm.tokens[11])
    print(tm.map[11])
    # print(f'[{doc.tokens_map.text}]')
    print(f'[{tm.text}]')

    print(len(tm))
    tm_sliced = tm.slice(slice(0, len(tm)))
    print('span-0')
    print(tm.map[0])
    print(tm_sliced.map[0])

    self.assertEqual(len(tm), len(tm_sliced))
    self.assertEqual(tm.map[0], tm_sliced.map[0])

    for c in range(len(tm.tokens[0])):
      print(c)
      self.assertEqual(0, tm.token_index_by_char(c))
      self.assertEqual(0, tm_sliced.token_index_by_char(c))

    self.assertEqual(tm.text, tm_sliced.text)

    self.assertEqual(0, tm.token_index_by_char(0))
Example #2
  def test_map_text_range(self):
    text = """1.2. мама   молилась ಶ್ರೀರಾಮ\n\nРама -- Вишну, А Вишну 
    ел... черешню? (черешня по 10 руб. 20 коп.) '' """

    tm = TextMap(text)
    t = tm.text_range([0, 3])
    self.assertEqual(t, '1.2. мама   молилась')
Example #3
  def test_map_tokens_in_range(self):
    text = '1.2. мама   ಶ್ರೀರಾಮ'
    tm = TextMap(text)

    tokens = tm.tokens_by_range([0, 2])
    self.assertEqual(len(tokens), 2)
    self.assertEqual(tokens[0], '1.2.')
    self.assertEqual(tokens[1], 'мама')
Example #4
  def test_split_span_add_delimiters(self):
    text = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    tm = TextMap(text)

    spans = [s for s in tm.split_spans('\n', add_delimiter=True)]
    for k in spans:
      print(tm.text_range(k))

    self.assertEqual('1 2 3\n', tm.text_range(spans[0]))
Example #5
  def test_concat_TextMap3(self):

    tm1 = TextMap('text prefix \n')
    tm2 = TextMap('more words')

    N = 10
    expected_tokens = len(tm1.tokens) + N * len(tm2.tokens)
    for _ in range(N):
      tm1 += tm2

    self.assertEqual(expected_tokens, len(tm1))
Example #6
  def test_find_value_sign_c(self):

    # `data` is assumed to be a module-level fixture of (sign, price, currency, ..., text) tuples
    for (sign_expected, price, currency, _, text) in data:
      tm = TextMap(text)
      sign, span = find_value_sign(tm)
      if sign_expected:
        self.assertEqual(sign_expected, sign, text)
      quote = ''
      if span:
        quote = tm.text_range(span)
      print(f'{sign},\t {span},\t {quote}')
Example #7
  def test_char_range(self):
    text = 'этилен мама ಶ್ರೀರಾಮ'
    tm = TextMap(text)
    cr = tm.char_range([0, 1])
    self.assertEqual('этилен', text[cr[0]:cr[1]])

    cr = tm.char_range([2, None])
    self.assertEqual('ಶ್ರೀರಾಮ', text[cr[0]:cr[1]])

    cr = tm.char_range([None, 1])
    self.assertEqual('этилен', text[cr[0]:cr[1]])
Example #8
    def embedd_contextualized_patterns(self, patterns, trim_padding=True):
        """Embed each (context_prefix, pattern, context_postfix) triple and return
        the pattern-region embeddings together with their (start, end) token spans."""
        tokenized_sentences_list: [Tokens] = []
        regions = []

        maxlen = 0
        lens = []
        for (ctx_prefix, pattern, ctx_postfix) in patterns:
            # sentence = ' '.join((ctx_prefix, pattern, ctx_postfix))

            prefix_tokens = TextMap(
                ctx_prefix).tokens  # tokenize_text(ctx_prefix)
            pattern_tokens = TextMap(pattern).tokens
            suffix_tokens = TextMap(ctx_postfix).tokens

            start = len(prefix_tokens)
            end = start + len(pattern_tokens)

            sentence_tokens = prefix_tokens + pattern_tokens + suffix_tokens

            regions.append((start, end))
            tokenized_sentences_list.append(sentence_tokens)
            lens.append(len(sentence_tokens))
            if len(sentence_tokens) > maxlen:
                maxlen = len(sentence_tokens)

        _strings = []

        for s in tokenized_sentences_list:
            s.extend([' '] * (maxlen - len(s)))
            _strings.append(s)
        _strings = np.array(_strings)

        # ======== call TENSORFLOW -----==================
        sentences_emb = self.embedd_tokenized_text(_strings, lens)
        # ================================================

        patterns_emb = []

        if trim_padding:
            for i, (start, end) in enumerate(regions):
                sentence_emb = sentences_emb[i]
                pattern_emb = sentence_emb[start:end]

                patterns_emb.append(pattern_emb)
            patterns_emb = np.array(patterns_emb)
        else:
            patterns_emb = sentences_emb

        return patterns_emb, regions
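
A minimal usage sketch (not from the original source), assuming `embedder` is an instance of the class that defines `embedd_contextualized_patterns` and `embedd_tokenized_text`; the pattern triples below are made up for illustration.

import numpy as np

# hypothetical (context_prefix, pattern, context_postfix) triples
patterns = [
    ('Стороны заключили', 'договор поставки', 'на следующих условиях'),
    ('Предметом договора является', 'оказание услуг', ''),
]

# `embedder` is assumed to expose the method defined above
patterns_emb, regions = embedder.embedd_contextualized_patterns(patterns, trim_padding=True)

for emb, (start, end) in zip(patterns_emb, regions):
    # with trim_padding=True each embedding covers only the pattern tokens
    print(np.asarray(emb).shape, (start, end))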
Example #9
  def test_finditer(self):
    from analyser.transaction_values import _re_greather_then
    text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(text)
    spans = list(tm.finditer(_re_greather_then))
    span = spans[0]

    self.assertEqual('превышающую', tm.text_range(span))
    self.assertEqual(4, span[0])
    self.assertEqual(5, span[1])
Example #10
  def test_concat_TextMap(self):

    tm1 = TextMap('a')
    tm2 = TextMap('b')

    tm1 += tm2
    self.assertEqual('ab', tm1.text)
    self.assertEqual('a', tm1.tokens[0])
    self.assertEqual('b', tm1.tokens[1])

    self.assertEqual(2, len(tm1))
    self.assertEqual(1, len(tm2))
Example #11
  def test_token_indices_by_char_range(self):
    text = 'мама'
    span = [0, 4]
    expected = text[span[0]:span[1]]
    print(expected)

    tm = TextMap(text)  # tokenization
    ti = tm.token_indices_by_char_range(span)
    self.assertEqual(0, ti[0])
    self.assertEqual(1, ti[1])

    self.assertEqual(expected, tm.text_range(ti))
Example #12
  def test_concat_TextMap2(self):

    tm1 = TextMap('alpha \n')
    tm2 = TextMap('bet')

    tm1 += tm2

    self.assertEqual(3, len(tm1))
    self.assertEqual(1, len(tm2))

    self.assertEqual('alpha \nbet', tm1.text)
    self.assertEqual('alpha', tm1.tokens[0])
    self.assertEqual('bet', tm1.tokens[2])
Example #13
def find_most_relevant_paragraphs(section: TextMap,
                                  attention_vector: FixedVector,
                                  min_len: int = 20,
                                  return_delimiters=True,
                                  threshold=0.45):
    _blur = int(HyperParameters.subject_paragraph_attention_blur)
    _padding = int(_blur * 2 + 1)

    paragraph_attention_vector = smooth_safe(
        np.pad(attention_vector, _padding, mode='constant'),
        _blur)[_padding:-_padding]

    paragraph_attention_vector = relu(paragraph_attention_vector, threshold)

    top_indices = [
        i for i, v in enumerate(paragraph_attention_vector) if v > 0.00001
    ]
    spans = []
    for i in top_indices:
        span = section.sentence_at_index(i, return_delimiters)
        # drop sentences shorter than min_len, when a minimum length is given
        if min_len is not None and span[1] - span[0] < min_len:
            continue
        if span not in spans:
            spans.append(span)

    return spans, paragraph_attention_vector
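
A short usage sketch (illustrative, not from the source), assuming the attention vector is simply an array with one score per token of the section; HyperParameters, smooth_safe and relu come from the surrounding module as in the definition above.

import numpy as np

tm = TextMap('стороны Заключили\n  договор  ПРЕДМЕТ \nДОГОВОРА')

# one attention score per token; emphasize the middle sentence
attention = np.zeros(len(tm))
attention[3:5] = 1.0

spans, smoothed = find_most_relevant_paragraphs(tm, attention,
                                                min_len=1,
                                                return_delimiters=True)
for span in spans:
    print(span, repr(tm.text_range(span)))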
Example #14
  def test_tokens_in_range_start_from_space(self):
    text = ' мама'
    tm = TextMap(text)

    self.assertEqual(1, tm.map[0][0])
    self.assertEqual(0, tm.token_index_by_char(0))

    txt = ' ' * 20 + '''основании Устава, с одной стороны, и Фонд «Благо»'''
    # tm = TextMap(txt)
    doc = LegalDocument(txt).parse()
    tm = doc.tokens_map
    print(tm.map[0])
    print(tm.tokens[11])
    print(tm.map[11])
    print(f'[{doc.tokens_map.text}]')
    print(f'[{doc.text}]')
Example #15
  def test_token_indices_by_char_range_sliced(self):
    text = 'm йe qwert'

    __tm = TextMap(text)  # tokenization
    tm = __tm.slice(slice(1, 2))

    self.assertEqual('йe', tm.tokens[0])
    self.assertEqual('йe', tm.text)

    char_range = tm.char_range([0, 1])
    ti = tm.token_indices_by_char_range(char_range)
    self.assertEqual(0, ti[0])
    self.assertEqual(1, ti[1])

    ti = tm.token_indices_by_char_range([1, 2])
    self.assertEqual(0, ti[0])
Example #16
  def test_slice(self):
    text = 'этилен мама   ಶ್ರೀರಾಮ'
    tm = TextMap(text)
    tm2: TextMap = tm.slice(slice(1, 2))

    self.assertEqual(tm2[0], 'мама')
    self.assertEqual(tm2.text, 'мама')

    tm3 = tm2.slice(slice(0, 1))
    self.assertEqual(tm3[0], 'мама')

    self.assertEqual(0, tm.token_index_by_char(1))
    self.assertEqual(0, tm2.token_index_by_char(1))
    self.assertEqual(0, tm3.token_index_by_char(1))

    self.assertEqual('мама', tm3.text)
    self.assertEqual('мама', tm3.text_range([0, 1]))
    self.assertEqual('мама', tm3.text_range([0, 2]))
Example #17
  def test_sentence_at_index_return_delimiters(self):

    tm = TextMap('стороны Заключили\n  договор  ПРЕДМЕТ \nДОГОВОРА')
    for i in range(len(tm)):
      print(i, tm[i])

    bounds = tm.sentence_at_index(0)
    print(bounds)
    print(tm.text_range(bounds))
    for i in range(0, 3):
      bounds = tm.sentence_at_index(i)
      self.assertEqual('стороны Заключили\n', tm.text_range(bounds), str(i))

    for i in range(3, 5):
      bounds = tm.sentence_at_index(i)
      self.assertEqual('договор  ПРЕДМЕТ \n', tm.text_range(bounds))

    for i in range(6, 7):
      bounds = tm.sentence_at_index(i)
      self.assertEqual('ДОГОВОРА', tm.text_range(bounds))
Example #18
  def test_normalize_basics(self):
    cn = CaseNormalizer()
    tm = TextMap('стороны Заключили (ХОРОШИЙ)договор, (уррраа!!) ПРЕДМЕТ ДОГОВОРА')

    tm2 = cn.normalize_tokens_map_case(tm)

    self.assertEqual(tm.map, tm2.map)
    self.assertEqual(tm2[1], 'заключили')
    self.assertEqual(tm2[12], 'Предмет')

    for i in range(len(tm)):
      self.assertEqual(tm2[i].lower(), tm[i].lower())
Example #19
  def test_tokens_in_range(self):
    text = 'мама'
    tm = TextMap(text)

    self.assertEqual(0, tm.token_index_by_char(0))
    self.assertEqual(0, tm.token_index_by_char(1))
    self.assertEqual(0, tm.token_index_by_char(2))
    self.assertEqual(0, tm.token_index_by_char(3))

    text = 'мама выла папу'
    tm = TextMap(text)

    self.assertEqual(1, tm.token_index_by_char(5))
    self.assertEqual(1, tm.token_index_by_char(6))
    self.assertEqual(1, tm.token_index_by_char(7))
    self.assertEqual(1, tm.token_index_by_char(8))

    self.assertEqual(2, tm.token_index_by_char(9))
    self.assertEqual(1, tm.token_index_by_char(4))
Example #20
def nn_get_tag_value(tagname: str,
                     textmap: TextMap,
                     semantic_map: DataFrame,
                     threshold=0.3) -> SemanticTag or None:
    att = semantic_map[tagname].values
    slices = find_top_spans(att, threshold=threshold,
                            limit=1)  # TODO: estimate per-tag thresholds

    if len(slices) > 0:
        span = slices[0].start, slices[0].stop
        value = textmap.text_range(span)
        tag = SemanticTag(tagname, value, span)
        tag.confidence = float(att[slices[0]].mean())
        return tag
    return None
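
A hedged example of calling nn_get_tag_value, assuming semantic_map is a pandas DataFrame with one row per token of textmap and one column per tag; the tag name 'value' and the scores are invented for illustration.

import numpy as np
from pandas import DataFrame

tm = TextMap('стоимость услуг составляет 2000000 рублей')

# hypothetical per-token attention for a tag named 'value'
att = np.zeros(len(tm))
att[3] = 0.9  # the token '2000000'

semantic_map = DataFrame({'value': att})

tag = nn_get_tag_value('value', tm, semantic_map, threshold=0.3)
if tag is not None:
    print(tag.confidence)  # confidence is the mean attention over the found span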
Example #21
  def test_get_by_index(self):

    ಶ್ರೀರಾಮ = self

    ಮ = 'ቋንቋ የድምጽ፣ የምልክት ወይም የምስል ቅንብር ሆኖ ለማሰብ'
    ቅ = TextMap(ಮ)
    ಶ್ರೀರಾಮ.assertEqual(ቅ[0], 'ቋንቋ')
    ಶ್ರೀರಾಮ.assertEqual(ቅ[1], 'የድምጽ፣')
    ಶ್ರೀರಾಮ.assertEqual(ቅ[2], 'የምልክት')

    # test iteration
    for x in ቅ:
      print(x)

    # test slicing
    print(ቅ[0:2])
Example #22
def doc_features(tokens_map: TextMap):
    body_lines_ranges = tokens_map.split_spans(PARAGRAPH_DELIMITER,
                                               add_delimiter=True)

    _doc_features = []
    _line_spans = []
    _prev_features = None
    for ln, line_span in enumerate(body_lines_ranges):
        _line_spans.append(line_span)

        _features = line_features(tokens_map, line_span, ln, _prev_features)
        _doc_features.append(_features)
        _prev_features = _features

    doc_features_data = np.array(pd.DataFrame.from_records(_doc_features))

    return doc_features_data, _line_spans
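
A brief usage sketch, assuming PARAGRAPH_DELIMITER and line_features are the module-level helpers referenced by doc_features above; the sample text is made up.

# hypothetical input: two short paragraphs separated by PARAGRAPH_DELIMITER
text = 'ПРЕДМЕТ ДОГОВОРА' + PARAGRAPH_DELIMITER + 'Стороны заключили договор.'
tm = TextMap(text)

features, line_spans = doc_features(tm)
print(features.shape)   # one feature row per line
print(line_spans)       # token spans of the lines (delimiters included)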
Example #23
  def test_concat_then_slice(self):
    text1 = 'этилен мама'
    text2 = 'этилен папа'

    tm0 = TextMap('')
    tm1 = TextMap(text1)
    tm2 = TextMap(text2)

    tm0 += tm1
    tm0 += tm2

    print(tm1.tokens)
    self.assertEqual(text1 + text2, tm0.text)
    self.assertEqual('мамаэтилен', tm0.text_range([1, 3]))

    tm3 = tm0.slice(slice(1, 3))
    self.assertEqual('мамаэтилен', tm3.text)
Example #24
 def get_sentence_map(self) -> TextMap:
     if 'sentences' in self.analysis['tokenization_maps']:
         _map = self.analysis['tokenization_maps']['sentences']
         tokens_map = TextMap(self.analysis['normal_text'], _map)
         return tokens_map
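
A small illustration (not from the source) of the data this method expects: self.analysis is assumed to hold a normalized text plus a pre-computed sentence tokenization map; only the dictionary keys match the code above, the sample values are made up.

# hypothetical analysis payload
analysis = {
    'normal_text': 'Первое предложение. Второе предложение.',
    'tokenization_maps': {
        'sentences': [[0, 20], [20, 39]],   # assumed (start_char, end_char) pairs
    },
}

sentence_map = TextMap(analysis['normal_text'],
                       analysis['tokenization_maps']['sentences'])
print(len(sentence_map), sentence_map.tokens)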
Example #25
 def test_find_value_sign_b(self):
   text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
   tm = TextMap(text)
   sign, span = find_value_sign(tm)
   quote = tm.text_range(span)
   self.assertEqual('менее', quote)
Example #26
  def test_PARAGRAPH_DELIMITER(self):

    tm = TextMap('a' + PARAGRAPH_DELIMITER + 'b')
    print(tm.tokens)
    self.assertEqual(3, len(tm))
Example #27
  def test_get_len(self):
    text = 'а б с'
    tm = TextMap(text)

    self.assertEqual(3, len(tm))
Example #28
 def test_get_tokens(self):
   text = 'ಉಂದು ಅರ್ತೊಪೂರ್ಣೊ ವಾಕ್ಯೊಲೆನ್ ಕೊರ್ಪುನ ಸಾಮರ್ತ್ಯೊನು ಹೊಂದೊಂತ್ '
   tm = TextMap(text)
   print(tm.tokens)
   self.assertEqual(6, len(tm.tokens))
Example #29
 def test_tokenize_numbered(self):
   text = '1. этилен мама, этилен!'
   tm = TextMap(text)
   self.assertEqual(tm.tokens[0], '1.')
   self.assertEqual(tm.tokens[1], 'этилен')
Example #30
  def test_split(self):
    text = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    tm = TextMap(text)

    for k in tm.split('\n'):
      print(k)