Example #1
    def sanatize_paragraph(paragraph) -> list:
        """
        Removes special characters, numbers, and excess whitespace.
        :param paragraph: list of strings
        :return: cleaned list of strings
        """
        sanatized = list()
        for ind, itm in enumerate(paragraph):
            if itm:
                itm = re.sub(r"\.", " ", itm)  # Replace dots with whitespaces
                itm = re.sub(
                    r"(?=[^ ])\P{L}", " ",
                    itm)  # Replace all non-word chars with whitespaces
                itm = re.sub(r" {2,}", " ", itm)  # Strip excess whitespaces

                if not itm or len(itm.strip()) <= 3:
                    continue
                sanatized.append(itm.strip().lower())

            if not itm and ind == 0:
                return list()
        if len(sanatized) == 1:
            return list()

        return sanatized
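
A note on the snippet above: `\P{L}` (any non-letter) is not supported by the stdlib `re` module, so the code presumably runs with the third-party regex package imported as `re`. A minimal sketch of the same cleanup on a made-up paragraph, under that assumption:

# A minimal sketch, assuming `import regex as re` (stdlib re does not support \P{L}).
import regex as re

sample = ["Dr. Müller, 42 Jahre alt!", "ok"]
cleaned = []
for item in sample:
    item = re.sub(r"\.", " ", item)             # dots -> spaces
    item = re.sub(r"(?=[^ ])\P{L}", " ", item)  # any non-letter that is not a space -> space
    item = re.sub(r" {2,}", " ", item)          # collapse runs of spaces
    cleaned.append(item.strip().lower())
print(cleaned)  # ['dr müller jahre alt', 'ok']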
Example #2
def format_transcript(path):
    """
    Converts the transcript text to all uppercase letters and removes all special characters (except whitespace and newline characters), per the agreed-upon convention.

    Parameters:
        path (string): String variable containing the path to the transcript .txt file

    Returns:
        Creates a new .txt file containing the formatted contents of the original

    """

    src = open(path, mode='r', encoding='utf-8')
    dst = open(path[:-4] + '-formatted.txt', mode='w', encoding='utf-8')

    transcript = src.read()

    transcript_array = [
        element.split(' ', 1) for element in transcript.strip().split('\n')
    ]

    if is_indexed(transcript_array):
        transcript = re.sub('[^\\p{L} \n\\d-]', '', transcript)
    else:
        transcript = re.sub('[^\\p{L} \n]', '', transcript)

    transcript = transcript.upper()

    dst.write(transcript)

    src.close()
    dst.close()
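
The two branches above keep or drop digits and hyphens depending on whether the transcript is indexed; `\p{L}` again implies the regex package. A small sketch of just the two character classes on a made-up line (the is_indexed check is not needed here):

# Minimal sketch of the two character classes, assuming `import regex as re`.
import regex as re

line = "42 Hello, world - 2nd try!"
print(re.sub('[^\\p{L} \n\\d-]', '', line).upper())  # keeps letters, spaces, newlines, digits, hyphens
print(re.sub('[^\\p{L} \n]', '', line).upper())      # keeps only letters, spaces, newlines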
Example #3
    def __init__(self, necessary_paths={config.hidden_folder: ["tex_data", "cache", "log", "topics"]}):
        make_dirs_recursive(necessary_paths)
        self.G = nx.MultiDiGraph()

        for _from, _to, functional_object in ____CONVERTERS____:
            self.add_edge(_from, _to, functional_object)
            self.add_starred(_from, _to, functional_object, ____CONVERTERS____)
            functional_object.ant = self

        for (_froms1, _tos1, functional_object1), \
            (_froms2, _tos2, functional_object2) \
                in itertools.permutations(____CONVERTERS____, 2):

            for (_to1, _from1, _to2, _from2) in list_or_values(_tos1, _froms1, _tos2, _froms2):
                if _from1 is None:
                    _from1 = OUT_OF_THE_BOX
                if _from2 is None:
                    _from2 = OUT_OF_THE_BOX

                try:
                    if match(_to1, _from2):
                        self.add_edge(_to1, regex.sub(_from2 + '$', _to2, _to1), functional_object2)
                    if match(_to2, _from1):
                        self.add_edge(_to2, regex.sub(_from1 + '$', _to1, _to2), functional_object1)
                except Exception as e:
                    logging.error(f"_to1 = {_to1}")
                    logging.error(
                        f"failing to compare {_to1} and {_to2} and {_from1} and {_from2} as regexes because {e}")
Example #4
 def fix_errors_in_citation(citation):
     """
     Fix some common inconsistencies in the references such as double spaces.
     """
     result = regex.sub(r"\s+", " ", citation)
     result = regex.sub(r"§(?=\d)", "§ ", result)
     result = regex.sub(r",\sbis\s", " bis ", result)
     return result
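
For illustration, a quick usage sketch, assuming the function above is in scope (the citation string is made up):

print(fix_errors_in_citation("§5  Abs. 2,  bis  §7"))
# -> '§ 5 Abs. 2 bis § 7'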
Example #5
 def extract(
     self,
     token: str,
     current_idx: int,
     relative_idx: int,
     tokens: Sequence[str],
     features: Dict[str, float],
 ):
     shape = regex.sub(
         UPPERCASE_RE, 'X',
         regex.sub(LOWERCASE_RE, 'x', re.sub(DIGIT_RE, '0', token)))
     features["shape[" + str(relative_idx) + "]=" + shape] = 1.0
Example #6
def remove_common_sub(domains):
    """
    Remove www. and m. subdomains
    """
    pattern = re.compile(r"^(?>www\.|m\.)")
    domains = [re.sub(pattern, "", x, concurrent=True) for x in domains]
    return set(domains)
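
The atomic group `(?>...)` and the `concurrent=` keyword are regex-package features as well; a minimal usage sketch with made-up domains:

# Assumes `import regex as re`; stdlib re has neither (?>...) nor concurrent=.
import regex as re

domains = ["www.example.com", "m.example.org", "blog.example.net"]
pattern = re.compile(r"^(?>www\.|m\.)")
print({re.sub(pattern, "", x) for x in domains})
# -> {'example.com', 'example.org', 'blog.example.net'} (set order may vary)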
Example #7
def add_whitespace_after_punctuation_marks(s: Text) -> Text:
    """
    >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.')
    'Живи еще хоть четверть века— Всё будет так. Исхода нет. '
    """
    return regex.sub(r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])', r'\g<1> ',
                     s)
Example #8
def delete_quote_links(text: str, tweet):
    if is_quote(tweet):
        text = regex.sub(get_tweet_url(tweet.quoted_status),
                         '',
                         text,
                         flags=regex.IGNORECASE)

    return text
Example #9
 def tojson(self):
     return {
         'title': regex.sub(' +', ' ', self.title.strip()),
         'link': self.link,
         'cover': self.imageUrl,
         'details': self.details,
         'screens': self.screens,
         'links': self.links
     }
Example #10
def extract_abp(content):
    """Extracts blocked and unblocked domains from ABP style content."""
    pattern_unsupported = re.compile(r"\S+(?>\/|\=)\S+", re.V1)
    pattern_supported_block = re.compile(
        r"^\|\|.+\^(?>$|.+(?:"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|"
        r"\bdocument\b|"
        r"\ball\b"
        # r"\ball\b|"
        # r"\bpopup\b"
        r"))",
        re.V1,
    )
    pattern_scrub_blocked_list = [
        r"^\|\|",
        r"\^($|.+(?>"
        r"\bfirst-party\b|"
        r"\b1p\b|"
        r"\bthird-party\b|"
        r"\b3p\b|\bdocument\b|"
        r"\ball\b|"
        r"\bpopup\b|"
        r"\S+))",
    ]
    pattern_scrub_blocked = re.compile(
        "|".join(f"(?:{p})" for p in pattern_scrub_blocked_list), re.V1
    )
    block_rules = [
        x
        for x in content
        if re.match(pattern_supported_block, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]

    blocked_domains = [
        re.sub(pattern_scrub_blocked, "", x, concurrent=True) for x in block_rules
    ]
    blocked_domains = [x for x in blocked_domains if valid_domain(x)]
    pattern_supported_unblock = re.compile(r"@@\|\|.+\^$")
    unblock_rules = [
        x
        for x in content
        if re.match(pattern_supported_unblock, x, concurrent=True)
        and not re.match(pattern_unsupported, x, concurrent=True)
    ]
    unblocked_domains = [
        x.replace("@@||", "").replace("^", "").replace("$important", "")
        for x in unblock_rules
    ]
    regex_rules = []
    return blocked_domains, unblocked_domains, unblock_rules, regex_rules
Example #11
def concat_category(out_file):
    """Concatenate category README.md files"""
    files = glob(f"{DirPath.input}/*/*.md")
    # Stable sorts: alphabetical order, then push "regional" files to the end, then pull "main" files to the front
    files = sorted(files)
    files = sorted(files, key=lambda x: "regional" in x)
    files = sorted(files, key=lambda x: "main" in x, reverse=True)
    for file in files:
        with open(file, encoding="utf-8") as file_input:
            with open(out_file, "a", encoding="utf-8") as file_output:
                lines = (re.sub(r"^#", r"##", x)
                         if re.match(r"^#{0,6}+\s", x) else x
                         for x in file_input)
                file_output.writelines(lines)
Example #12
def fix_escape_characters(text: str):
    text = text.replace('&amp;', r'\&')
    text = text.replace('&lt;', r'\<')
    text = text.replace('&gt;', r'\>')

    # Escape Discord's markdown
    text = text.replace('`', r'\`')
    text = text.replace('*', r'\*')
    text = text.replace('~', r'\~')

    # Special exception for underscore because Twitter user names may contain them
    text = regex.sub(r'(?<!@\S*)_', r'\_', text)

    return text
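
A usage sketch (made-up tweet text), assuming the function above is in scope. The variable-width lookbehind `(?<!@\S*)` requires the regex package; stdlib re only allows fixed-width lookbehinds.

print(fix_escape_characters('Tom &amp; Jerry *rocks*, says @some_user about _this_'))
# The markdown characters get backslash-escaped; the underscore inside the
# @handle is left alone thanks to the lookbehind, while the ones around 'this' are escaped.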
Example #13
    def _parse_infobox(self, text, title):
        result = []
        text = regex.sub(r'\n ?\|', '\n|', text)
        lines = text.split('\n|')
        for line in lines:
            date_in_text = self.find_date(line)
            if date_in_text:
                info = [x.strip() for x in line.split('=')]
                result.append(
                    Index(token=title,
                          date=date_in_text.date,
                          info=info[0].replace('\n', '')))

        return result
Example #14
 def on_edit(self, instance, value):
     if not value:
         if self.textinput:
             self.remove_widget(self.textinput)
         return
     unformatted_text = regex.sub(self.unformat_bbcode, "", self.text)
     self.textinput = t = SelectableLabel(text=unformatted_text,
                                          size_hint=(None, None),
                                          font_size=self.font_size,
                                          font_name=self.font_name,
                                          pos=self.pos,
                                          size=self.size,
                                          multiline=False)
     self.bind(pos=t.setter('pos'), size=t.setter('size'))
     self.add_widget(self.textinput)
     t.bind(on_text_validate=self.on_text_validate,
            focus=self.on_text_focus)
Example #15
def bills():
    data_dir = '../../lab1/data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)

            bill = open(os.path.join(data_dir, directory), encoding='UTF-8').read()
            text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
            # print(text[:400])

            r = regex.match(
                r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)',
                text)

            if r is None:
                yield bill, "", "", "", "f"
            else:
                yield bill, r.group("title"), r.group("journal_year"), r.group("position"), directory.split('.')[0]
Example #16
def extract_hosts(content, list_type):
    """Extracts blocked or unblocked domains from hosts/domains style content."""
    pattern_scrub = [
        r"(?>\#|\!|\s+\#|\s+\!).*",
        r"^\s",
        r".*\blocalhost\b.*",
        r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
        r"^(?>www\.|m\.)",
    ]
    pattern = re.compile("|".join(f"(?:{p})" for p in pattern_scrub), re.V1)
    domains = [re.sub(pattern, "", x, concurrent=True) for x in content]
    domains = [x for x in domains if valid_domain(x)]
    blocked_domains, unblocked_domains = [], []
    if list_type == "unblock":
        unblocked_domains = domains
    if list_type == "block":
        blocked_domains = domains

    return blocked_domains, unblocked_domains
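
A minimal sketch of the same combined-alternation scrubbing on made-up hosts lines, using a subset of the patterns above (the valid_domain filter is omitted; assumes `import regex as re`):

import regex as re

pattern_scrub = [
    r"(?>\#|\!|\s+\#|\s+\!).*",
    r"^\d*\.\d*\.\d*\.\d*\s*(?>\s|www\.|m\.)",
    r"^(?>www\.|m\.)",
]
pattern = re.compile("|".join(f"(?:{p})" for p in pattern_scrub), re.V1)
lines = ["0.0.0.0 www.ads.example.com  # tracker", "m.example.org"]
print([re.sub(pattern, "", x) for x in lines])
# -> ['ads.example.com', 'example.org']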
Example #17
def replace_hashtag_with_link(text: str, hashtag_entities=None):
    if hashtag_entities is not None:
        hashtags_sorted = sorted(hashtag_entities,
                                 key=lambda x: x['indices'][0],
                                 reverse=True)

        for hashtag in hashtags_sorted:
            start, end = hashtag['indices']

            # text[start] is either '#' or '＃' (fullwidth), so this preserves the original character used
            hashtag_text = text[start] + hashtag['text']
            text = text[0:start] + get_named_link(
                hashtag_text, get_hashtag_url(hashtag_text)) + text[end:]
    else:
        hashtags = regex.findall(r'(?:[#|＃])[^\d\W][\w]*', text)
        for hashtag in hashtags:
            text = regex.sub(
                regex.escape(hashtag),
                fr'{get_named_link(hashtag, get_hashtag_url(hashtag))}', text)

    return text
Example #18
def replace_mention_with_link(text: str,
                              user_mentions_entities,
                              in_reply_to_screen_name: str = None):
    if not user_mentions_entities:
        return text

    for mention in user_mentions_entities:
        mention_text = '@' + mention['screen_name']

        if in_reply_to_screen_name and mention[
                'screen_name'] == in_reply_to_screen_name:
            text = regex.sub(regex.escape(mention_text),
                             '',
                             text,
                             flags=regex.IGNORECASE)
        else:
            text = text.replace(
                mention_text,
                get_named_link(
                    mention_text,
                    get_profile_url(screen_name=mention['screen_name'])))

    return text
Example #19
    def parse_text(self):
        results = []
        self.text = regex.sub(r'&lt;ref.*\n?.*&lt;/ref&gt;',
                              repl="",
                              string=self.text)
        self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)
        self.text = regex.sub(r'{{[cC]ite.*}}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)

        if self.paragraph_splitter(sep='== See also =='):
            pass
        elif self.paragraph_splitter(sep='==Notes=='):
            pass
        elif self.paragraph_splitter(sep='==References=='):
            pass
        elif self.paragraph_splitter(sep='== Bibliography =='):
            pass
        elif self.paragraph_splitter(sep='== External links =='):
            pass
        elif self.paragraph_splitter(sep='=== Sources ==='):
            pass

        sentences_reg = regex.finditer(
            r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
            self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance

        for sentence_it in sentences_reg:
            sentence = sentence_it.group(0)
            date_in_text = self.find_date(sentence)
            if date_in_text:
                look_before = 60
                look_after = 30
                start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
                end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                    sentence) else len(sentence)
                if date_in_text.end + look_after > len(sentence):
                    token = self.find_token(sentence[start:],
                                            date_in_text.start,
                                            date_in_text.end)
                else:
                    token = self.find_token(
                        sentence[start:date_in_text.end + look_after],
                        date_in_text.start, date_in_text.end)

                token_context = sentence[start:end]

                # token with full word at beginning
                i = start
                counter = 0
                while True:
                    i -= 1
                    counter += 1
                    if i < 0 or counter > 8:
                        break

                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context = sentence[i + 1:start] + token_context
                        break

                # token with full word at end
                i = end
                counter = 0
                while True:
                    i += 1
                    counter += 1
                    if i > len(sentence) - 1 or counter > 8:
                        break
                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context += sentence[end:end + counter]
                        break

                token_context = token_context.replace('\n', ' ')
                token_context = regex.sub(r'[^a-zA-Z1-9.!?:%$ ]', '',
                                          token_context)
                token_context = token_context.strip()

                results.append(
                    Index(token=token if token else self.title,
                          date=date_in_text.date,
                          info=token_context))

        return results
Example #20
from typing import List, Callable, Text
from regex import regex

PUNCTUATION_MARKS_REGEX = r'\.,!?:;\"\-—'
assert regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '',
                 PUNCTUATION_MARKS_REGEX.replace('\\', '')) == ''


def add_whitespace_after_punctuation_marks(s: Text) -> Text:
    """
    >>> add_whitespace_after_punctuation_marks('Живи еще хоть четверть века—Всё будет так.Исхода нет.')
    'Живи еще хоть четверть века— Всё будет так. Исхода нет. '
    """
    return regex.sub(r'(\p{L}+' + f'[{PUNCTUATION_MARKS_REGEX}])', r'\g<1> ',
                     s)


def remove_punctuation_marks(s: Text) -> Text:
    """
    >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"')
    'Good morning gentlemen wordwordwordword Text'
    """
    return regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', s)


def keep_only_words(s: Text):  # TODO: not working yet
    return regex.sub(rf'\W+|\S+', '', s)


def remove_extra_whitespaces(s: Text) -> Text:
    """
Example #21
 def normalize(in_data):
     """Cleans the filterlist file."""
     re.sub(r"\r", "", in_data)
     re.sub(r"\n+", "\n", in_data)
     return re.sub(checksum_pattern, "", in_data)
Example #22
def keep_only_words(s: Text):  # TODO: not working yet
    return regex.sub(rf'\W+|\S+', '', s)
Example #23
def remove_punctuation_marks(s: Text) -> Text:
    """
    >>> remove_punctuation_marks('Good morning, gentlemen! word.word:word;word? "Text"')
    'Good morning gentlemen wordwordwordword Text'
    """
    return regex.sub(rf'[{PUNCTUATION_MARKS_REGEX}]', '', s)
Example #24
def format_filename(string):
    string = string.lower()
    filename = re.sub(r"[<>:\'\"\/\|?.*]", "", string)
    filename = filename.replace(" ", "_")
    return filename
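
A quick usage sketch (the input string is made up), assuming `import re` and the function above:

print(format_filename('My "Great" Report: 2024?'))
# -> 'my_great_report_2024'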
Example #25
def bills():
    data_dir = '../data'
    for directory in os.listdir(data_dir):
        if directory.endswith('txt'):
            # print("directory: " + directory)
            yield open(os.path.join(data_dir, directory), encoding='UTF-8').read()



if __name__ == '__main__':
    b = {}
    for year in range(1900, 2500):
        b[str(year)] = {}

    for bill in bills():

        text = regex.sub(r"[ \t\r\f\v ][ \t\r\f\v ]+", "", bill)
        # print(text[:400])

        r = regex.match(r'\s*(Dz\.U\.\s*z\s*(?P<journal_year>\d+)\s*r\.\s*(N|n)r\s*(?P<journal_number>\d+),?\s*?poz.\s*(?P<position>\d+).?\s*)?([a-żA-Ż \d\.\(\)]*\s?){0,4}\s*(ustawa|USTAWA|U S T A W A|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\s*z\s*dnia\s*\d{1,2}\s*[a-żA-Ź]*\s*(?P<year>\d{4})\s*r\.\s*(?P<title>[\s\S]*?)\n\s*(Rozdział\s*(1|I)|Art.\s*(1|l)[^\d]|TYTUŁ\s*I|Dział\s*I|część\s*ogólna)', text)
        # r = regex.match(r'\n*(Dz\.U\.z(?P<journal_year>\d+)r\.(N|n)r(?P<journal_number>\d+),?poz.(?P<position>\d+).?)?([a-żA-Ż\d\.\(\)]*\n?){0,4}\n*(ustawa|USTAWA|Ustawa|ustawA|USTAWa)[^\n]*\n[^\n]*\n*z\n?dnia\n?\d{1,2}\n?[a-żA-Ź]*\n?(?P<year>\d{4})\n?r\.\n*(?P<title>(.+\n)*?)\n*?(?P<title2>(.+\n)*?)\n*?(Rozdział(1|I)|Art.\n?(1|l)[^\d]|TYTUŁI|DziałI|częśćogólna)', text)
        # print(title.group())

        position = r.group("position")
        year = r.group("journal_year") or r.group("year")
        b[year][position] = {}
        b[year][position]["counter"] = 0
        b[year][position]["title"] = r.group("title")
        b[year][position]["journal_number"] = r.group("journal_number")
        b[year][position]["journal_year"] = r.group("journal_year")
        b[year][position]["year"] = r.group("year")
        b[year][position]["position"] = position
Example #26
    def parse_text(self):
        results = []
        self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
        self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)
        self.text = regex.sub(r'{{[cC]ite.*}}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)

        if self.paragraph_splitter(sep='== See also =='):
            pass
        elif self.paragraph_splitter(sep='==Notes=='):
            pass
        elif self.paragraph_splitter(sep='==References=='):
            pass
        elif self.paragraph_splitter(sep='== Bibliography =='):
            pass
        elif self.paragraph_splitter(sep='== External links =='):
            pass
        elif self.paragraph_splitter(sep='=== Sources ==='):
            pass

        sentences_reg = regex.finditer(
            r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
            self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance

        for sentence_it in sentences_reg:
            sentence = sentence_it.group(0)
            date_in_text = self.find_date(sentence)
            if date_in_text:
                look_before = 60
                look_after = 30
                start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
                end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                    sentence) else len(sentence)
                # if date_in_text.end + look_after > len(sentence):
                #     token = self.find_token(sentence[start:], date_in_text.start, date_in_text.end)
                # else:
                #     token = self.find_token(sentence[start:date_in_text.end + look_after], date_in_text.start, date_in_text.end)

                token_context = sentence[start:end]

                # token with full word at beginning
                i = start
                counter = 0
                while True:
                    i -= 1
                    counter += 1
                    if i < 0 or counter > 8:
                        break

                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context = sentence[i + 1:start] + token_context
                        break

                # token with full word at end
                i = end
                counter = 0
                while True:
                    i += 1
                    counter += 1
                    if i > len(sentence) - 1 or counter > 8:
                        break
                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context += sentence[end:end + counter]
                        break

                token_context = token_context.replace('\n', ' ')
                token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$;, ]', '',
                                          token_context)
                token_context = token_context.strip()

                results.append(
                    Index(token=self.title,
                          date=date_in_text.date,
                          info=token_context))

                #  I couldn't find a token that explains the purpose well; often the result was not
                #  meaningful, therefore I decided not to use it.

                # tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
                #
                # proper_nouns = []
                # nouns = []
                # for (word, pos) in tokenized:
                #     if pos == 'NNP':
                #         proper_nouns.append(word)
                #     elif pos == 'NN':
                #         nouns.append(word)
                #
                # results.append(Index(token=proper_nouns[0] if proper_nouns else "title", date=date_in_text.date, info=proper_nouns[1] if
                # len(proper_nouns) > 1 else nouns[0] if nouns else ""))

        return results
Example #27
    def split_citation_part(string: str):
        """
        The string is tokenized. Tokens are identified as units or values. Pairs are
        built to connect the units with their respective values. If the unit cannot
        be identified (and must be inferred later), None is returned.

        Args:
            string: A string that is part of a reference and cites *one* part of a statute.

        Returns: As a generator, tuples are returned, each containing the unit (or None)
            and the respective value.
        """

        # Tokenization

        # fmt: off
        string = regex.sub(
            r"("
            r"\d+(?>\.\d+)?[a-z]?|"
            r"\b[ivx]+|"
            r"\b[a-z]\)?"
            r")"
            r"(\sff?\.|\sff\b)",
            r"\1ff.",
            string,
            flags=regex.IGNORECASE,
        )
        # fmt: on
        tokens = split_unit_number_pattern.split(string)

        # Building pairs of units with their resp. values

        while len(tokens) > 0:
            token = tokens.pop(0)
            if StatutesParser.is_unit(token):
                if len(tokens) > 0:
                    unit = StatutesParser.stem_unit(token)
                    token = tokens.pop(0)
                    numb = token
                    assert StatutesParser.is_numb(numb), numb
                else:  # when citation ends with unit
                    print(
                        f"Citation {string} ends with unit {token}. Ignoring last unit."
                    )
                    break

            elif StatutesParser.is_pre_numb(token):
                numb = token
                token = tokens.pop(0)
                if not StatutesParser.is_unit(token):
                    print(token, "is not a unit in", string)
                    continue
                    # to fix citation "§ 30 DRITTER ABSCHNITT"
                    # Last part is now ignored,
                    # but reference areas can still be improved.
                unit = StatutesParser.stem_unit(token)

            elif StatutesParser.is_numb(token):
                unit = None
                numb = token
            else:
                raise StringCaseException(token, "in", string)
            numb = regex.sub(r"(ff?\.|ff|\))$", "", numb)
            yield [unit, numb]
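
The first substitution normalizes spaced "f."/"ff." suffixes onto the preceding number or letter before tokenization. A sketch of just that step (the citation string is made up):

import regex

s = regex.sub(
    r"(\d+(?>\.\d+)?[a-z]?|\b[ivx]+|\b[a-z]\)?)(\sff?\.|\sff\b)",
    r"\1ff.",
    "§ 23 ff. Abs. 1 f.",
    flags=regex.IGNORECASE,
)
print(s)  # '§ 23ff. Abs. 1ff.'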
Example #28
def remove_extra_whitespaces(s: Text) -> Text:
    """
    >>> remove_extra_whitespaces(' Out   on the tar plains  The glides   are moving ')
    ' Out on the tar plains The glides are moving '
    """
    return regex.sub(r'\s{2,}', ' ', s)
Example #29
 def zeroize(sample):
     return "\n".join([regex.sub(CONLL_LINE, r"\1  \2  O  O", line) for line in sample.split('\n')])
Example #30
 def clean(self, text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
     text = "".join([c for c in text if c in self.allowed_chars])
     text = re.sub(r'\s+', ' ', text)
     return text.strip()
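
A standalone sketch of the same whitelist-style cleaning (allowed_chars here is a made-up whitelist; the method above presumably takes it from the instance):

import re

allowed_chars = set("abcdefghijklmnopqrstuvwxyz .,!?")
text = "héllo,   wörld!!"
text = re.sub(r'\s+', ' ', text)                       # collapse whitespace
text = "".join(c for c in text if c in allowed_chars)  # drop anything outside the whitelist
print(re.sub(r'\s+', ' ', text).strip())               # 'hllo, wrld!!'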