Ejemplo n.º 1
0
  def test_map_tokens_in_range(self):
    """Check that token range [0, 2] of the sample text yields its first two tokens."""
    sample = '1.2. мама   ಶ್ರೀರಾಮ'
    text_map = TextMap(sample)
    expected_tokens = ['1.2.', 'мама']

    selected = text_map.tokens_by_range([0, 2])

    # Verify the size first so a wrong count fails as an assertion, not an IndexError.
    self.assertEqual(len(selected), 2)
    for position, expected in enumerate(expected_tokens):
      self.assertEqual(selected[position], expected)
Ejemplo n.º 2
0
def line_features(tokens_map: TextMap, line_span: (int, int), line_number: int,
                  prev_features):
    """Build the feature dictionary describing one line of a document.

    Args:
        tokens_map: text map; its ``tokens_by_range`` and ``text_range``
            methods are called with ``line_span``.
        line_span: range of the line, passed unchanged to ``tokens_map``.
        line_number: stored as-is under the ``'line_number'`` key.
        prev_features: currently unused; kept so existing callers keep working.

    Returns:
        A dict mapping feature names to numeric values (flags go through
        ``_onehot``, the rest are counts, lengths or numbering components).

    NOTE(review): ``has_contract``, ``has_article`` and ``all_uppercase`` were
    previously computed incorrectly (see inline comments). Anything fitted on
    the old values presumably needs to be re-fitted -- confirm with the owner.
    """
    tokens: Tokens = tokens_map.tokens_by_range(line_span)
    # TODO: add previous and next lines features
    txt: str = tokens_map.text_range(line_span)
    txt_lower = txt.lower()
    txt_rstripped = txt.rstrip()

    numbers, span, _, roman_flag = get_tokenized_line_number(tokens, 0)
    if numbers:
        number_major = numbers[0]
        number_minor = numbers[-1]
    else:
        # -2 marks "no numbering detected" for both components.
        numbers = []
        number_major = number_minor = -2

    # Everything after the numbering prefix is treated as the header text.
    header_text = ' '.join(tokens[span[1]:])

    # Compare the ORIGINAL-case text: the old code lowercased it first, so the
    # flag could only be set for text without any cased letters.
    # Text with no cased letters at all still counts as "all upper".
    all_upper = header_text.upper() == header_text

    header_id = header_text.lower()

    features = {
        'line_number': line_number,
        # `in` instead of str.find(): find() returns -1 (truthy) when the word
        # is absent and 0 (falsy) when the line starts with it.
        'has_contract': _onehot('договор' in txt_lower),
        'has_article': _onehot('статья' in txt_lower),
        'all_uppercase': _onehot(all_upper),
        'len_tokens': len(tokens),
        'len_chars': len(txt),
        'number_level': len(numbers),
        'number_minor': number_minor,
        'number_major': number_major,
        'number_roman': _onehot(roman_flag),
        'endswith_dot': _onehot(txt_rstripped.endswith('.')),
        'endswith_comma': _onehot(txt_rstripped.endswith(',')),
        'endswith_underscore': _onehot(txt_rstripped.endswith('_')),

        # counts
        'dots': header_id.count('.'),
        'tabs': txt.count('\t'),
        'spaces_inside': txt.strip().count(' '),
        'spaces_all': txt.count(' '),
        'commas': header_id.count(','),
        'brackets': _count_strange_symbols(txt, '(){}[]'),
        'dashes': header_id.count('-'),
        'colons': header_id.count(':'),
        'semicolons': header_id.count(';'),
        'strange_symbols': _count_strange_symbols(header_id, '[$@+]?^&'),
        'capitals': _count_capitals(txt),
        'digits': _count_digits(header_id),
        'quotes': _count_strange_symbols(txt, '«»"\'"'),
        'underscores': _count_strange_symbols(txt, '_')
    }

    return features