Example #1
    def sanatize_paragraph(paragraph) -> list:
        """
        Removes special characters, numbers, and excess whitespace.
        :param paragraph: list of strings
        :return: cleaned list of strings
        """
        sanatized = list()
        for ind, itm in enumerate(paragraph):
            if itm:
                itm = re.sub(r"\.", " ", itm)  # Replace dots with whitespaces
                itm = re.sub(
                    r"(?=[^ ])\P{L}", " ",
                    itm)  # Replace all non-word chars with whitespaces
                itm = re.sub(r" {2,}", " ", itm)  # Strip excess whitespaces

                if not itm or len(itm.strip()) <= 3:
                    continue
                sanatized.append(itm.strip().lower())

            if not itm and ind == 0:
                return list()
        if len(sanatized) == 1:
            return list()

        return sanatized
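
A note on the snippet above: `\P{L}` (any non-letter) is not supported by the stdlib `re` module, so the code presumably runs with the third-party regex package imported as `re`. A minimal sketch of the same cleanup on a made-up paragraph, under that assumption:

# A minimal sketch, assuming `import regex as re` (stdlib re does not support \P{L}).
import regex as re

sample = ["Dr. Müller, 42 Jahre alt!", "ok"]
cleaned = []
for item in sample:
    item = re.sub(r"\.", " ", item)             # dots -> spaces
    item = re.sub(r"(?=[^ ])\P{L}", " ", item)  # any non-letter that is not a space -> space
    item = re.sub(r" {2,}", " ", item)          # collapse runs of spaces
    cleaned.append(item.strip().lower())
print(cleaned)  # ['dr müller jahre alt', 'ok']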
Example #2
def format_transcript(path):
    """
    Converts the transcript text to all uppercase letters and removes all special characters (except whitespace and newline characters), per the agreed-upon convention.

    Parameters:
        path (string): String variable containing the path to the transcript .txt file

    Returns:
        Creates a new .txt file containing the formatted contents of the original

    """

    src = open(path, mode='r', encoding='utf-8')
    dst = open(path[:-4] + '-formatted.txt', mode='w', encoding='utf-8')

    transcript = src.read()

    transcript_array = [
        element.split(' ', 1) for element in transcript.strip().split('\n')
    ]

    if is_indexed(transcript_array):
        transcript = re.sub('[^\\p{L} \n\\d-]', '', transcript)
    else:
        transcript = re.sub('[^\\p{L} \n]', '', transcript)

    transcript = transcript.upper()

    dst.write(transcript)

    src.close()
    dst.close()
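
The two branches above keep or drop digits and hyphens depending on whether the transcript is indexed; `\p{L}` again implies the regex package. A small sketch of just the two character classes on a made-up line (the is_indexed check is not needed here):

# Minimal sketch of the two character classes, assuming `import regex as re`.
import regex as re

line = "42 Hello, world - 2nd try!"
print(re.sub('[^\\p{L} \n\\d-]', '', line).upper())  # keeps letters, spaces, newlines, digits, hyphens
print(re.sub('[^\\p{L} \n]', '', line).upper())      # keeps only letters, spaces, newlines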
Example #3
    def __init__(self, necessary_paths={config.hidden_folder: ["tex_data", "cache", "log", "topics"]}):
        make_dirs_recursive(necessary_paths)
        self.G = nx.MultiDiGraph()

        for _from, _to, functional_object in ____CONVERTERS____:
            self.add_edge(_from, _to, functional_object)
            self.add_starred(_from, _to, functional_object, ____CONVERTERS____)
            functional_object.ant = self

        for (_froms1, _tos1, functional_object1), \
            (_froms2, _tos2, functional_object2) \
                in itertools.permutations(____CONVERTERS____, 2):

            for (_to1, _from1, _to2, _from2) in list_or_values(_tos1, _froms1, _tos2, _froms2):
                if _from1 is None:
                    _from1 = OUT_OF_THE_BOX
                if _from2 is None:
                    _from2 = OUT_OF_THE_BOX

                try:
                    if match(_to1, _from2):
                        self.add_edge(_to1, regex.sub(_from2 + '$', _to2, _to1), functional_object2)
                    if match(_to2, _from1):
                        self.add_edge(_to2, regex.sub(_from1 + '$', _to1, _to2), functional_object1)
                except Exception as e:
                    logging.error(f"_to1 = {_to1}")
                    logging.error(
                        f"failing to compare {_to1} and {_to2} and {_from1} and {_from2} as regexes because {e}")
Example #4
 def fix_errors_in_citation(citation):
     """
     Fix some common inconsistencies in the references such as double spaces.
     """
     result = regex.sub(r"\s+", " ", citation)
     result = regex.sub(r"§(?=\d)", "§ ", result)
     result = regex.sub(r",\sbis\s", " bis ", result)
     return result
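
For illustration, a quick usage sketch, assuming the function above is in scope (the citation string is made up):

print(fix_errors_in_citation("§5  Abs. 2,  bis  §7"))
# -> '§ 5 Abs. 2 bis § 7'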
Example #5
 def extract(
     self,
     token: str,
     current_idx: int,
     relative_idx: int,
     tokens: Sequence[str],
     features: Dict[str, float],
 ):
     shape = regex.sub(
         UPPERCASE_RE, 'X',
         regex.sub(LOWERCASE_RE, 'x', re.sub(DIGIT_RE, '0', token)))
     features["shape[" + str(relative_idx) + "]=" + shape] = 1.0
Example #6
def remove_common_sub(domains):
    """
    Remove www. and m. subdomains
    """
    pattern = re.compile(r"^(?>www\.|m\.)")
    domains = [re.sub(pattern, "", x, concurrent=True) for x in domains]
    return set(domains)
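
The atomic group `(?>...)` and the `concurrent=` keyword are regex-package features as well; a minimal usage sketch with made-up domains:

# Assumes `import regex as re`; stdlib re has neither (?>...) nor concurrent=.
import regex as re

domains = ["www.example.com", "m.example.org", "blog.example.net"]
pattern = re.compile(r"^(?>www\.|m\.)")
print({re.sub(pattern, "", x) for x in domains})
# -> {'example.com', 'example.org', 'blog.example.net'} (set order may vary)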
Example #7
def add_whitespace_after_punctuation_marks(s: Text) -> Text:
    """
    >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.')
    'Живи еще хоть четверть века— Всё будет так. Исхода нет. '
    """
    return regex.sub(r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])', r'\g<1> ',
                     s)
Example #8
def delete_quote_links(text: str, tweet):
    if is_quote(tweet):
        text = regex.sub(get_tweet_url(tweet.quoted_status),
                         '',
                         text,
                         flags=regex.IGNORECASE)

    return text
Example #9
 def tojson(self):
     return {
         'title': regex.sub(' +', ' ', self.title.strip()),
         'link': self.link,
         'cover': self.imageUrl,
         'details': self.details,
         'screens': self.screens,
         'links': self.links
     }
Example #10
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content."""
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]

    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
Example #11
def concat_category(out_file):
    """Concatenate category README.md files"""
    files = glob(f"{DirPath.input}/*/*.md")
    # Stable sorts: alphabetical order, then push "regional" files to the end, then pull "main" files to the front
    files = sorted(files)
    files = sorted(files, key=lambda x: "regional" in x)
    files = sorted(files, key=lambda x: "main" in x, reverse=True)
    for file in files:
        with open(file, encoding="utf-8") as file_input:
            with open(out_file, "a", encoding="utf-8") as file_output:
                lines = (re.sub(r"^#", r"##", x)
                         if re.match(r"^#{0,6}+\s", x) else x
                         for x in file_input)
                file_output.writelines(lines)
Example #12
def fix_escape_characters(text: str):
    text = text.replace('&amp;', r'\&')
    text = text.replace('&lt;', r'\<')
    text = text.replace('&gt;', r'\>')

    # Escape Discord's markdown
    text = text.replace('`', r'\`')
    text = text.replace('*', r'\*')
    text = text.replace('~', r'\~')

    # Special exception for underscore because Twitter user names may contain them
    text = regex.sub(r'(?<!@\S*)_', r'\_', text)

    return text
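
A usage sketch (made-up tweet text), assuming the function above is in scope. The variable-width lookbehind `(?<!@\S*)` requires the regex package; stdlib re only allows fixed-width lookbehinds.

print(fix_escape_characters('Tom &amp; Jerry *rocks*, says @some_user about _this_'))
# The markdown characters get backslash-escaped; the underscore inside the
# @handle is left alone thanks to the lookbehind, while the ones around 'this' are escaped.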
Example #13
    def _parse_infobox(self, text, title):
        result = []
        text = regex.sub(r'\n ?\|', '\n|', text)
        lines = text.split('\n|')
        for line in lines:
            date_in_text = self.find_date(line)
            if date_in_text:
                info = [x.strip() for x in line.split('=')]
                result.append(
                    Index(token=title,
                          date=date_in_text.date,
                          info=info[0].replace('\n', '')))

        return result
Example #14
 def on_edit(self, instance, value):
     if not value:
         if self.textinput:
             self.remove_widget(self.textinput)
         return
     unformatted_text = regex.sub(self.unformat_bbcode, "", self.text)
     self.textinput = t = SelectableLabel(text=unformatted_text,
                                          size_hint=(None, None),
                                          font_size=self.font_size,
                                          font_name=self.font_name,
                                          pos=self.pos,
                                          size=self.size,
                                          multiline=False)
     self.bind(pos=t.setter('pos'), size=t.setter('size'))
     self.add_widget(self.textinput)
     t.bind(on_text_validate=self.on_text_validate,
            focus=self.on_text_focus)
Example #15
def bills():
    data_dir = '../../lab1/data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)

            bill = open(os.path.join(data_dir, directory), encoding='UTF-8').read()
            text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
            # print(text[:400])

            r = regex.match(
                r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)',
                text)

            if r is None:
                yield bill, "", "", "", "f"
            else:
                yield bill, r.group("title"), r.group("journal_year"), r.group("position"), directory.split('.')[0]
Example #16
def extract_hosts(content, list_type):
    """Extracts blocked or unblocked domains from hosts/domains style content."""
    pattern_scrub = [
        r"(?>\#|\!|\s+\#|\s+\!).*",
        r"^\s",
        r".*\blocalhost\b.*",
        r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
        r"^(?>www\.|m\.)",
    ]
    pattern = re.compile("|".join(f"(?:{p})" for p in pattern_scrub), re.V1)
    domains = [re.sub(pattern, "", x, concurrent=True) for x in content]
    domains = [x for x in domains if valid_domain(x)]
    blocked_domains, unblocked_domains = [], []
    if list_type == "unblock":
        unblocked_domains = domains
    if list_type == "block":
        blocked_domains = domains

    return blocked_domains, unblocked_domains
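
A minimal sketch of the same combined-alternation scrubbing on made-up hosts lines, using a subset of the patterns above (the valid_domain filter is omitted; assumes `import regex as re`):

import regex as re

pattern_scrub = [
    r"(?>\#|\!|\s+\#|\s+\!).*",
    r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
    r"^(?>www\.|m\.)",
]
pattern = re.compile("|".join(f"(?:{p})" for p in pattern_scrub), re.V1)
lines = ["0.0.0.0 www.ads.example.com  # tracker", "m.example.org"]
print([re.sub(pattern, "", x) for x in lines])
# -> ['ads.example.com', 'example.org']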
Example #17
def replace_hashtag_with_link(text: str, hashtag_entities=None):
    if hashtag_entities is not None:
        hashtags_sorted = sorted(hashtag_entities,
                                 key=lambda x: x['indices'][0],
                                 reverse=True)

        for hashtag in hashtags_sorted:
            start, end = hashtag['indices']

            # text[start] is either '#' or '＃' (fullwidth), so this preserves the original character used
            hashtag_text = text[start] + hashtag['text']
            text = text[0:start] + get_named_link(
                hashtag_text, get_hashtag_url(hashtag_text)) + text[end:]
    else:
        hashtags = regex.findall(r'(?:[#|＃])[^\d\W][\w]*', text)
        for hashtag in hashtags:
            text = regex.sub(
                regex.escape(hashtag),
                fr'{get_named_link(hashtag, get_hashtag_url(hashtag))}', text)

    return text
Example #18
def replace_mention_with_link(text: str,
                              user_mentions_entities,
                              in_reply_to_screen_name: str = None):
    if not user_mentions_entities:
        return text

    for mention in user_mentions_entities:
        mention_text = '@' + mention['screen_name']

        if in_reply_to_screen_name and mention[
                'screen_name'] == in_reply_to_screen_name:
            text = regex.sub(regex.escape(mention_text),
                             '',
                             text,
                             flags=regex.IGNORECASE)
        else:
            text = text.replace(
                mention_text,
                get_named_link(
                    mention_text,
                    get_profile_url(screen_name=mention['screen_name'])))

    return text
Example #19
    def parse_text(self):
        results = []
        self.text = regex.sub(r'&lt;ref.*\n?.*&lt;/ref&gt;',
                              repl="",
                              string=self.text)
        self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)
        self.text = regex.sub(r'{{[cC]ite.*}}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)

        if self.paragraph_splitter(sep='== See also =='):
            pass
        elif self.paragraph_splitter(sep='==Notes=='):
            pass
        elif self.paragraph_splitter(sep='==References=='):
            pass
        elif self.paragraph_splitter(sep='== Bibliography =='):
            pass
        elif self.paragraph_splitter(sep='== External links =='):
            pass
        elif self.paragraph_splitter(sep='=== Sources ==='):
            pass

        sentences_reg = regex.finditer(
            r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
            self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance

        for sentence_it in sentences_reg:
            sentence = sentence_it.group(0)
            date_in_text = self.find_date(sentence)
            if date_in_text:
                look_before = 60
                look_after = 30
                start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
                end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                    sentence) else len(sentence)
                if date_in_text.end + look_after > len(sentence):
                    token = self.find_token(sentence[start:],
                                            date_in_text.start,
                                            date_in_text.end)
                else:
                    token = self.find_token(
                        sentence[start:date_in_text.end + look_after],
                        date_in_text.start, date_in_text.end)

                token_context = sentence[start:end]

                # token with full word at beginning
                i = start
                counter = 0
                while True:
                    i -= 1
                    counter += 1
                    if i < 0 or counter > 8:
                        break

                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context = sentence[i + 1:start] + token_context
                        break

                # token with full word at end
                i = end
                counter = 0
                while True:
                    i += 1
                    counter += 1
                    if i > len(sentence) - 1 or counter > 8:
                        break
                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context += sentence[end:end + counter]
                        break

                token_context = token_context.replace('\n', ' ')
                token_context = regex.sub(r'[^a-zA-Z1-9.!?:%$ ]', '',
                                          token_context)
                token_context = token_context.strip()

                results.append(
                    Index(token=token if token else self.title,
                          date=date_in_text.date,
                          info=token_context))

        return results
Example #20
from typing import List, Callable, Text
from regex import regex

PUNCTUATION_MARKS_REGEX = r'\.,!?:;\"\-—'
assert regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '',
                 PUNCTUATION_MARKS_REGEX.replace('\\', '')) == ''


def add_whitespace_after_punctuation_marks(s: Text) -> Text:
    """
    >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.')
    'Живи еще хоть четверть века— Всё будет так. Исхода нет. '
    """
    return regex.sub(r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])', r'\g<1> ',
                     s)


def remove_punctuation_marks(s: Text) -> Text:
    """
    >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"')
    'Good morning gentlemen wordwordwordword Text'
    """
    return regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', s)


def keep_only_words(s: Text):  # TODO: not working yet
    return regex.sub(rf'\W+|\S+', '', s)


def remove_extra_whitespaces(s: Text) -> Text:
    """
Example #21
 def normalize(in_data):
     """Cleans the filterlist file."""
     re.sub(r"\r", "", in_data)
     re.sub(r"\n+", "\n", in_data)
     return re.sub(checksum_pattern, "", in_data)
Example #22
def keep_only_words(s: Text):  # TODO: not working yet
    return regex.sub(rf'\W+|\S+', '', s)
Example #23
def remove_punctuation_marks(s: Text) -> Text:
    """
    >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"')
    'Good morning gentlemen wordwordwordword Text'
    """
    return regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', s)
Example #24
def format_filename(string):
    string = string.lower()
    filename = re.sub(r"[<>:\'\"\/\|?.*]", "", string)
    filename = filename.replace(" ", "_")
    return filename
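
A quick usage sketch (the input string is made up), assuming `import re` and the function above:

print(format_filename('My "Great" Report: 2024?'))
# -> 'my_great_report_2024'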
Example #25
def bills():
    data_dir = '../data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)
            yield open(os.path.join(data_dir, directory), encoding='UTF-8').read()



if __name__ == '__main__':
    b = {}
    for year in range(1900, 2500):
        b[str(year)] = {}

    for bill in bills():

        text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
        # print(text[:400])

        r = regex.match(r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)', text)
        # r = regex.match(r'\n*(Dz\.U\.z(?P<journal_year>\d+)r\.(N|n)r(?P<journal_number>\d+),?poz.(?P<position>\d+).?)?([a-żA-Ż\d\.\(\)]*\n?){0,4}\n*(ustawa|USTAWA|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\n*z\n?dnia\n?\d{1,2}\n?[a-żA-Ź]*\n?(?P<year>\d{4})\n?r\.\n*(?P<title>(.+\n)*?)\n*?(?P<title2>(.+\n)*?)\n*?(Rozdział(1|I)|Art.\n?(1|l)[^\d]|TYTUŁI|DziałI|częśćogólna)', text)
        # print(title.group())

        position = r.group("position")
        year = r.group("journal_year") or r.group("year")
        b[year][position] = {}
        b[year][position]["counter"] = 0
        b[year][position]["title"] = r.group("title")
        b[year][position]["journal_number"] = r.group("journal_number")
        b[year][position]["journal_year"] = r.group("journal_year")
        b[year][position]["year"] = r.group("year")
        b[year][position]["position"] = position
Example #26
    def parse_text(self):
        results = []
        self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
        self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)
        self.text = regex.sub(r'{{[cC]ite.*}}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)

        if self.paragraph_splitter(sep='== See also =='):
            pass
        elif self.paragraph_splitter(sep='==Notes=='):
            pass
        elif self.paragraph_splitter(sep='==References=='):
            pass
        elif self.paragraph_splitter(sep='== Bibliography =='):
            pass
        elif self.paragraph_splitter(sep='== External links =='):
            pass
        elif self.paragraph_splitter(sep='=== Sources ==='):
            pass

        sentences_reg = regex.finditer(
            r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
            self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance

        for sentence_it in sentences_reg:
            sentence = sentence_it.group(0)
            date_in_text = self.find_date(sentence)
            if date_in_text:
                look_before = 60
                look_after = 30
                start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
                end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                    sentence) else len(sentence)
                # if date_in_text.end + look_after > len(sentence):
                #     token = self.find_token(sentence[start:], date_in_text.start, date_in_text.end)
                # else:
                #     token = self.find_token(sentence[start:date_in_text.end + look_after], date_in_text.start, date_in_text.end)

                token_context = sentence[start:end]

                # token with full word at beginning
                i = start
                counter = 0
                while True:
                    i -= 1
                    counter += 1
                    if i < 0 or counter > 8:
                        break

                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context = sentence[i + 1:start] + token_context
                        break

                # token with full word at end
                i = end
                counter = 0
                while True:
                    i += 1
                    counter += 1
                    if i > len(sentence) - 1 or counter > 8:
                        break
                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context += sentence[end:end + counter]
                        break

                token_context = token_context.replace('\n', ' ')
                token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$;, ]', '',
                                          token_context)
                token_context = token_context.strip()

                results.append(
                    Index(token=self.title,
                          date=date_in_text.date,
                          info=token_context))

                #  I couldn't find a token that explains the purpose well; often the result was not
                #  meaningful, therefore I decided not to use it.

                # tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
                #
                # proper_nouns = []
                # nouns = []
                # for (word, pos) in tokenized:
                #     if pos == 'NNP':
                #         proper_nouns.append(word)
                #     elif pos == 'NN':
                #         nouns.append(word)
                #
                # results.append(Index(token=proper_nouns[0] if proper_nouns else "title", date=date_in_text.date, info=proper_nouns[1] if
                # len(proper_nouns) > 1 else nouns[0] if nouns else ""))

        return results
Example #27
    def split_citation_part(string: str):
        """
        The string is tokenized. Tokens are identified as units or values. Pairs are
        built to connect the units with their respective values. If the unit cannot
        be identified (and must be inferred later), None is returned.

        Args:
            string: A string that is part of a reference and cites *one* part of a statute.

        Returns: As a generator, tuples are returned, each containing the unit (or None)
            and the respective value.
        """

        # Tokenization

        # fmt: off
        string = regex.sub(
            r"("
            r"\d+(?>\.\d+)?[a-z]?|"
            r"\b[ivx]+|"
            r"\b[a-z]\)?"
            r")"
            r"(\sff?\.|\sff\b)",
            r"\1ff.",
            string,
            flags=regex.IGNORECASE,
        )
        # fmt: on
        tokens = split_unit_number_pattern.split(string)

        # Building pairs of units with their resp. values

        while len(tokens) > 0:
            token = tokens.pop(0)
            if StatutesParser.is_unit(token):
                if len(tokens) > 0:
                    unit = StatutesParser.stem_unit(token)
                    token = tokens.pop(0)
                    numb = token
                    assert StatutesParser.is_numb(numb), numb
                else:  # when citation ends with unit
                    print(
                        f"Citation {string} ends with unit {token}. Ignoring last unit."
                    )
                    break

            elif StatutesParser.is_pre_numb(token):
                numb = token
                token = tokens.pop(0)
                if not StatutesParser.is_unit(token):
                    print(token, "is not a unit in", string)
                    continue
                    # to fix citation "§ 30 DRITTER ABSCHNITT"
                    # Last part is now ignored,
                    # but reference areas can still be improved.
                unit = StatutesParser.stem_unit(token)

            elif StatutesParser.is_numb(token):
                unit = None
                numb = token
            else:
                raise StringCaseException(token, "in", string)
            numb = regex.sub(r"(ff?\.|ff|\))$", "", numb)
            yield [unit, numb]
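
The first substitution normalizes spaced "f."/"ff." suffixes onto the preceding number or letter before tokenization. A sketch of just that step (the citation string is made up):

import regex

s = regex.sub(
    r"(\d+(?>\.\d+)?[a-z]?|\b[ivx]+|\b[a-z]\)?)(\sff?\.|\sff\b)",
    r"\1ff.",
    "§ 23 ff. Abs. 1 f.",
    flags=regex.IGNORECASE,
)
print(s)  # '§ 23ff. Abs. 1ff.'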
Example #28
def remove_extra_whitespaces(s: Text) -> Text:
    """
    >>> remove_extra_whitespaces(' Out   on the tar plains  The glides   are moving ')
    ' Out on the tar plains The glides are moving '
    """
    return regex.sub(r'\s{2,}', ' ', s)
Example #29
 def zeroize(sample):
     return "\n".join([regex.sub(CONLL_LINE, r"\1  \2  O  O", line) for line in sample.split('\n')])
Example #30
 def clean(self, text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
     text = "".join([c for c in text if c in self.allowed_chars])
     text = re.sub(r'\s+', ' ', text)
     return text.strip()
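
A standalone sketch of the same whitelist-style cleaning (allowed_chars here is a made-up whitelist; the method above presumably takes it from the instance):

import re

allowed_chars = set("abcdefghijklmnopqrstuvwxyz .,!?")
text = "héllo,   wörld!!"
text = re.sub(r'\s+', ' ', text)                       # collapse whitespace
text = "".join(c for c in text if c in allowed_chars)  # drop anything outside the whitelist
print(re.sub(r'\s+', ' ', text).strip())               # 'hllo, wrld!!'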