Example #1
0
  def test_split_span_add_delimiters(self):
    """Check that the first span keeps its trailing newline delimiter.

    The sample text has three lines: Latin digits, Cyrillic and Kannada.
    """
    sample = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    text_map = TextMap(sample)

    pieces = list(text_map.split_spans('\n', add_delimiter=True))
    # Dump every piece; this also calls text_range on each span, not only
    # on the first one asserted below.
    for piece in pieces:
      print(text_map.text_range(piece))

    first_piece_text = text_map.text_range(pieces[0])
    self.assertEqual('1 2 3\n', first_piece_text)
Example #2
0
def doc_features(tokens_map: TextMap):
    """Build a per-line feature matrix for a document.

    The document is split into spans on PARAGRAPH_DELIMITER (called with
    add_delimiter=True). For every span, line_features is evaluated with the
    span, its zero-based index, and the features returned for the preceding
    span (None for the first one).

    Returns:
        A tuple ``(features, spans)``: ``features`` is a numpy array built
        from a pandas DataFrame of the per-line feature records, and
        ``spans`` is the list of line spans in the same order.
    """
    line_spans = []
    feature_rows = []
    previous = None

    span_source = tokens_map.split_spans(PARAGRAPH_DELIMITER,
                                         add_delimiter=True)
    for line_no, span in enumerate(span_source):
        line_spans.append(span)
        # Each line's features may depend on those of the line before it.
        previous = line_features(tokens_map, span, line_no, previous)
        feature_rows.append(previous)

    frame = pd.DataFrame.from_records(feature_rows)
    return np.array(frame), line_spans