def find_most_relevant_paragraphs(section: TextMap, attention_vector: FixedVector, min_len: int = 20,
                                  return_delimiters=True, threshold=0.45):
    """Collect sentence spans of ``section`` located under high-attention positions.

    The attention vector is zero-padded on both sides, smoothed with
    ``smooth_safe``, stripped of the padded margins again and finally passed
    through ``relu`` with ``threshold``. Every position whose resulting value
    is greater than 1e-5 is mapped to a span via ``section.sentence_at_index``.

    Args:
        section: Object providing ``sentence_at_index(index, return_delimiters)``.
        attention_vector: Attention values; padded with ``np.pad`` before smoothing.
        min_len: Length bound compared against ``span[1] - span[0]``.
        return_delimiters: Forwarded unchanged to ``section.sentence_at_index``.
        threshold: Second argument handed to ``relu``.

    Returns:
        A tuple ``(spans, paragraph_attention_vector)``: the list of distinct
        collected spans (in order of first appearance) and the smoothed,
        thresholded attention vector.
    """
    blur = int(HyperParameters.subject_paragraph_attention_blur)
    margin = int(blur * 2 + 1)

    # Pad before smoothing and cut the same margins off afterwards.
    padded = np.pad(attention_vector, margin, mode='constant')
    smoothed = smooth_safe(padded, blur)
    paragraph_attention_vector = relu(smoothed[margin:-margin], threshold)

    spans = []
    for index, weight in enumerate(paragraph_attention_vector):
        if not weight > 0.00001:
            continue

        span = section.sentence_at_index(index, return_delimiters)

        # NOTE(review): only spans SHORTER than min_len are collected, and
        # nothing is collected when min_len is None. This looks inverted for a
        # "minimum length" parameter -- confirm the intent before changing it.
        is_short = min_len is not None and span[1] - span[0] < min_len
        if is_short and span not in spans:
            spans.append(span)

    return spans, paragraph_attention_vector
def test_sentence_at_index_return_delimiters(self):
    """Check which sentence text ``sentence_at_index`` yields for token indices.

    ``sentence_at_index`` is called with its default arguments; the expected
    texts of the first two sentences include their trailing newline.
    """
    text_map = TextMap('стороны Заключили\n договор ПРЕДМЕТ \nДОГОВОРА')

    # Debug output: every index of the map together with the item stored there.
    for position in range(len(text_map)):
        print(position, text_map[position])

    first_span = text_map.sentence_at_index(0)
    print(first_span)
    print(text_map.text_range(first_span))

    def sentence_text(token_index):
        # Text of the sentence span reported for the given token index.
        return text_map.text_range(text_map.sentence_at_index(token_index))

    for token_index in range(0, 3):
        self.assertEqual('стороны Заключили\n', sentence_text(token_index), str(token_index))

    # NOTE(review): index 5 is not asserted by any loop here -- confirm
    # whether that gap is intentional.
    for token_index in range(3, 5):
        self.assertEqual('договор ПРЕДМЕТ \n', sentence_text(token_index))

    for token_index in range(6, 7):
        self.assertEqual('ДОГОВОРА', sentence_text(token_index))