Exemple #1
0
    def test_strip_text_coords(self):
        """
        Check the coordinate-aware strip helpers of TextBeautifier.

        Each helper is expected to return the trimmed text together with
        the start / end positions shifted by the number of characters
        removed on the corresponding side.
        """
        source = '    (A) right of set-off; (B) '

        # (helper name, expected (text, start, end)) for the input span 100..127
        cases = (
            ('strip_string_coords',
             ('(A) right of set-off; (B)', 104, 126)),
            ('lstrip_string_coords',
             ('(A) right of set-off; (B) ', 104, 127)),
            ('rstrip_string_coords',
             ('    (A) right of set-off; (B)', 100, 126)),
        )

        for helper_name, expected in cases:
            helper = getattr(TextBeautifier, helper_name)
            self.assertEqual(expected, helper(source, 100, 127))
Exemple #2
0
def trim_defined_term(term: str, start: int, end: int) -> \
        Tuple[str, int, int, bool]:
    """
    Clean up a defined-term candidate, keeping its coordinates in sync.

    Steps, in order:
     - if QUOTED_TEXT_RE finds exactly one fragment in the phrase, keep only
       that fragment with the framing quote characters stripped
     - remove pair of quotes / brackets framing text
       (TextBeautifier.strip_pair_symbols)
     - replace line breaks with spaces
     - replace every SPACES_RE match with a single space
     - strip STRIP_PUNCT_SYMBOLS from both sides
     - strip dots from both sides, or only from the left side when the term
       ends with an abbreviation (ABBREVIATION_ENDING_RE)

    :param term: a phrase that may contain excess framing symbols
    :param start: original term's start position, may be changed
    :param end: original term's end position, may be changed
    :return: updated term, start, end and the flag indicating that the whole phrase was inside quotes
    """
    was_quoted = False

    # pick text from quotes
    quoted_matches = list(QUOTED_TEXT_RE.finditer(term))
    if len(quoted_matches) == 1:
        quote_chars = '''\"'“„'''
        match = quoted_matches[0]
        fragment = match.group()
        # strip the sides separately so we know how many characters
        # were dropped on each side
        left_trimmed = fragment.lstrip(quote_chars)
        unquoted = left_trimmed.rstrip(quote_chars)

        # narrowing the term to the quoted fragment must also narrow the
        # coordinates: skip everything before the fragment plus the leading
        # quotes, and everything after the fragment plus the trailing quotes
        start += match.start() + (len(fragment) - len(left_trimmed))
        end -= (len(term) - match.end()) + (len(left_trimmed) - len(unquoted))

        term = unquoted
        was_quoted = True

    orig_term_len = len(term)
    orig_term_quotes = count_sequence_matches(
        term, lambda c: c in TextBeautifier.QUOTES)
    term, start, end = TextBeautifier.strip_pair_symbols((term, start, end))
    if len(term) < orig_term_len:
        # probably we removed quotes: more than one quote character
        # disappearing is treated as "the phrase was quoted"
        updated_term_quotes = count_sequence_matches(
            term, lambda c: c in TextBeautifier.QUOTES)
        was_quoted = was_quoted or orig_term_quotes - updated_term_quotes > 1

    term = term.replace('\n', ' ')
    term = SPACES_RE.sub(' ', term)

    term, start, end = TextBeautifier.strip_string_coords(
        term, start, end, STRIP_PUNCT_SYMBOLS)

    # strip all dots or just left one (if ends with abbreviation)
    ends_with_abbr = ABBREVIATION_ENDING_RE.search(term)
    if not ends_with_abbr:
        term, start, end = TextBeautifier.strip_string_coords(
            term, start, end, '.')
    else:
        # keep the trailing dot: it belongs to the abbreviation
        term, start, end = TextBeautifier.lstrip_string_coords(
            term, start, end, '.')

    return term, start, end, was_quoted