Esempio n. 1
0
    def __init__(self, is_tuple=False):
        """Set up a SentenceSplitter.

        Pass is_tuple=True when the tokenized paragraphs carry token
        classes or extra info in addition to the tokens themselves.

        """
        # A token made up solely of full stops, ellipsis characters
        # (optionally followed by full stops), or exclamation/question
        # marks.
        ending_pattern = r"^(?:\.+|…+\.*|[!?]+)$"
        # A single apostrophe, double quote, inverted mark, initial
        # quote (Pi), opening bracket (Ps) or dash -- or a run of two
        # or more hyphens.
        opening_pattern = r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$"
        # A single apostrophe, double quote, final quote (Pf) or
        # closing bracket (Pe).
        closing_pattern = r"^(?:['\"“\p{Pf}\p{Pe}])$"

        self.is_tuple = is_tuple
        self.sentence_ending_punct = re.compile(ending_pattern)
        self.opening_punct = re.compile(opening_pattern)
        self.closing_punct = re.compile(closing_pattern)

        # Loaded through the project's resource helper.
        abbreviation_file = "eos_abbreviations.txt"
        self.eos_abbreviations = utils.read_abbreviation_file(abbreviation_file)
Esempio n. 2
0
    def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de"):
        """Create a Tokenizer object and compile all patterns it uses.

        If split_camel_case is set to True, tokens written in
        CamelCase will be split. If token_classes is set to True, the
        tokenizer will output the token class for each token (if it is
        a number, an XML tag, an abbreviation, etc.). If extra_info is
        set to True, the tokenizer will output information about the
        original spelling of the tokens.

        language selects the language-specific resource files
        (abbreviation lists and, for "en", the non-breaking hyphenation
        lists). A value that is not contained in
        self.supported_languages is replaced by self.default_language.

        """
        self.split_camel_case = split_camel_case
        self.token_classes = token_classes
        self.extra_info = extra_info
        self.language = language if language in self.supported_languages else self.default_language
        self.unique_string_length = 7
        self.mapping = {}
        self.unique_prefix = None
        self.replacement_counter = 0

        self.spaces = re.compile(r"\s+")
        # C0 and C1 control characters
        control_chars = r"\u0000-\u001F\u007F-\u009F"
        # soft hyphen (00AD), zero-width space (200B), zero-width
        # non-joiner (200C), zero-width joiner (200D), Arabic letter
        # mark (061C), left-to-right mark (200E), right-to-left mark
        # (200F), word joiner (2060), left-to-right isolate (2066),
        # right-to-left isolate (2067), first strong isolate (2068),
        # pop directional isolate (2069), l-t-r/r-t-l embedding (202A,
        # 202B), l-t-r/r-t-l override (202D, 202E), pop directional
        # formatting (202C), zero-width no-break space (FEFF)
        invisible_chars = r"\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF"
        # combination of both classes; shared by the three "junk"
        # patterns below so that they cannot drift apart
        junk_chars = control_chars + invisible_chars
        self.controls = re.compile(r"[" + control_chars + r"]")
        self.stranded_variation_selector = re.compile(r" \uFE0F")
        self.other_nasties = re.compile(r"[" + invisible_chars + r"]")
        self.starts_with_junk = re.compile(r"^[" + junk_chars + r"]+")
        self.junk_next_to_space = re.compile(r"(?:^|\s)[" + junk_chars + r"]+|[" + junk_chars + r"]+(?:\s|$)")
        self.junk_between_spaces = re.compile(r"(?:^|\s+)[\s" + junk_chars + r"]+(?:\s+|$)")

        # My Additions
        self.letter_hyphen = re.compile(r'\b\p{Lu}-\p{L}{3,}\b')

        # TAGS, EMAILS, URLs
        self.xml_declaration = re.compile(r"""<\?xml
                                              (?:                #   This group permits zero or more attributes
                                                \s+              #   Whitespace to separate attributes
                                                [_:A-Z][-.:\w]*  #   Attribute name
                                                \s*=\s*          #   Attribute name-value delimiter
                                                (?: "[^"]*"      #   Double-quoted attribute value
                                                  | '[^']*'      #   Single-quoted attribute value
                                                )
                                              )*
                                              \s*                #   Permit trailing whitespace
                                              \?>""", re.VERBOSE | re.IGNORECASE)
        # self.tag = re.compile(r'<(?!-)(?:/[^> ]+|[^>]+/?)(?<!-)>')
        # taken from Regular Expressions Cookbook
        self.tag = re.compile(r"""
                                  <
                                  (?:                  # Branch for opening tags:
                                    ([_:A-Z][-.:\w]*)  #   Capture the opening tag name to backreference 1
                                    (?:                #   This group permits zero or more attributes
                                      \s+              #   Whitespace to separate attributes
                                      [_:A-Z][-.:\w]*  #   Attribute name
                                      \s*=\s*          #   Attribute name-value delimiter
                                      (?: "[^"]*"      #   Double-quoted attribute value
                                        | '[^']*'      #   Single-quoted attribute value
                                      )
                                    )*
                                    \s*                #   Permit trailing whitespace
                                    /?                 #   Permit self-closed tags
                                  |                    # Branch for closing tags:
                                    /
                                    ([_:A-Z][-.:\w]*)  #   Capture the closing tag name to backreference 2
                                    \s*                #   Permit trailing whitespace
                                  )
                                  >
        """, re.VERBOSE | re.IGNORECASE)
        # regex for email addresses taken from:
        # http://www.regular-expressions.info/email.html
        # self.email = re.compile(r"\b[\w.%+-]+@[\w.-]+\.\p{L}{2,}\b")
        self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
        # simple regex for urls that start with http or www
        # TODO: allow a closing parenthesis at the end if an opening
        # one appeared after http etc.
        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+?\(\S*?\)\S*(?=$|[\'. "!?,;\n\t])', re.IGNORECASE)
        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+[^\'. "!?,;:\n\t)]', re.IGNORECASE)
        self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
        self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
        # we also allow things like tagesschau.de-App
        self.url_without_protocol = re.compile(r'\b[\w./-]+\.(?:de|com|org|net|edu|info|gov|jpg|png|gif|log|txt|xlsx?|docx?|pptx?|pdf)(?:-\w+)?\b', re.IGNORECASE)
        self.reddit_links = re.compile(r'(?<!\w)/?[rlu](?:/\w+)+/?(?!\w)', re.IGNORECASE)

        # XML entities
        self.entity_name = re.compile(r'&(?:quot|amp|apos|lt|gt);', re.IGNORECASE)
        self.entity_decimal = re.compile(r'&#\d+;')
        self.entity_hex = re.compile(r'&#x[0-9a-f]+;', re.IGNORECASE)

        # EMOTICONS
        emoticon_set = {"(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:",
                        "(-:", ")=", ")o:", ")x", ":'C", ":/",
                        ":<", ":C", ":[", "=(", "=)", "=D", "=P",
                        ">:", "\\:", "]:", "x(", "^^", "o.O",
                        "\\O/", "\\m/", ":;))", "_))", "*_*", "._.",
                        ":wink:", ">_<", "*<:-)", ":!:", ":;-))"}
        # longest first, so that in the alternation below a longer
        # emoticon is tried before any shorter one that is its prefix
        emoticon_list = sorted(emoticon_set, key=len, reverse=True)
        self.emoticon = re.compile(r"""(?:(?:[:;]|(?<!\d)8)           # a variety of eyes, alt.: [:;8]
                                        [-'oO]?                       # optional nose or tear
                                        (?: \)+ | \(+ | [*] | ([DPp])\1*(?!\w)))   # a variety of mouths
                                    """ +
                                   r"|" +
                                   r"(?:\b[Xx]D+\b)" +
                                   r"|" +
                                   r"(?:\b(?:D'?:|oO)\b)" +
                                   r"|" +
                                   r"|".join(re.escape(emoticon) for emoticon in emoticon_list), re.VERBOSE)
        self.space_emoticon = re.compile(r'([:;])[ ]+([()])(?! *[\+0])')
        # ^3 is an emoticon, unless it is preceded by a number (with
        # optional whitespace between number and ^3)
        # ^\^3    # beginning of line, no leading characters
        # ^\D^3   # beginning of line, one leading character
        # (?<=\D[ ])^3   # two leading characters, non-number + space
        # (?<=.[^\d ])^3   # two leading characters, x + non-space-non-number
        self.heart_emoticon = re.compile(r"(?:^|^\D|(?<=\D[ ])|(?<=.[^\d ]))\^3")
        # U+2600..U+26FF        Miscellaneous Symbols
        # U+2700..U+27BF        Dingbats
        # U+FE0E..U+FE0F        text and emoji variation selectors
        # U+1F300..U+1F5FF      Miscellaneous Symbols and Pictographs
        # -> U+1F3FB..U+1F3FF   Emoji modifiers (skin tones)
        # U+1F600..U+1F64F      Emoticons
        # U+1F680..U+1F6FF      Transport and Map Symbols
        # U+1F900..U+1F9FF      Supplemental Symbols and Pictographs
        # self.unicode_symbols = re.compile(r"[\u2600-\u27BF\uFE0E\uFE0F\U0001F300-\U0001f64f\U0001F680-\U0001F6FF\U0001F900-\U0001F9FF]")
        self.unicode_flags = re.compile(r"\p{Regional_Indicator}{2}\uFE0F?")

        # special tokens containing + or &
        tokens_with_plus_or_ampersand = utils.read_abbreviation_file("tokens_with_plus_or_ampersand.txt")
        # "simple" entries (word chars around a single & or +) go into
        # a set that is checked against regex candidates; the remaining
        # entries are compiled into one alternation
        plus_amp_simple = [(pa, re.search(r"^\w+[&+]\w+$", pa)) for pa in tokens_with_plus_or_ampersand]
        self.simple_plus_ampersand = {token.lower() for token, is_simple in plus_amp_simple if is_simple}
        self.simple_plus_ampersand_candidates = re.compile(r"\b\w+[&+]\w+\b")
        tokens_with_plus_or_ampersand = [token for token, is_simple in plus_amp_simple if not is_simple]
        # self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:\L<patokens>)(?!\w)", re.IGNORECASE, patokens=tokens_with_plus_or_ampersand)
        self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:" + r"|".join(re.escape(token) for token in tokens_with_plus_or_ampersand) + r")(?!\w)", re.IGNORECASE)

        # camelCase
        self.emoji = re.compile(r'\bemojiQ\p{L}{3,}\b')
        camel_case_token_list = utils.read_abbreviation_file("camel_case_tokens.txt")
        # same split as above: purely alphanumeric entries are kept in
        # a set, everything else is compiled into an alternation
        cc_alnum = [(cc, re.search(r"^\w+$", cc)) for cc in camel_case_token_list]
        self.simple_camel_case_tokens = {token for token, is_alnum in cc_alnum if is_alnum}
        self.simple_camel_case_candidates = re.compile(r"\b\w*\p{Ll}\p{Lu}\w*\b")
        camel_case_token_list = [token for token, is_alnum in cc_alnum if not is_alnum]
        # things like ImmobilienScout24.de are already covered by URL detection
        # self.camel_case_url = re.compile(r'\b(?:\p{Lu}[\p{Ll}\d]+){2,}\.(?:de|com|org|net|edu)\b')
        # The last alternative used to read ":Mac..." (a stray colon,
        # presumably left over from a "(?:" group). Together with the
        # leading \b it could only match when a word character directly
        # preceded a literal colon, so names like "MacBook" were never
        # matched.
        self.camel_case_token = re.compile(r"\b(?:" + r"|".join(re.escape(token) for token in camel_case_token_list) + r"|Mac\p{Lu}\p{Ll}*)\b")
        # self.camel_case_token = re.compile(r"\b(?:\L<cctokens>|Mac\p{Lu}\p{Ll}*)\b", cctokens=camel_case_token_set)
        self.in_and_innen = re.compile(r'\b\p{L}+\p{Ll}In(?:nen)?\p{Ll}*\b')
        self.camel_case = re.compile(r'(?<=\p{Ll}{2})(\p{Lu})(?!\p{Lu}|\b)')

        # GENDER STAR
        self.gender_star = re.compile(r'\b\p{L}+\*in(?:nen)?\p{Ll}*\b', re.IGNORECASE)

        # ABBREVIATIONS
        self.single_letter_ellipsis = re.compile(r"(?<![\w.])(?P<a_letter>\p{L})(?P<b_ellipsis>\.{3})(?!\.)")
        self.and_cetera = re.compile(r"(?<![\w.&])&c\.(?!\p{L}{1,3}\.)")
        self.str_abbreviations = re.compile(r'(?<![\w.])([\p{L}-]+-Str\.)(?!\p{L})', re.IGNORECASE)
        self.nr_abbreviations = re.compile(r"(?<![\w.])(\w+\.-?Nr\.)(?!\p{L}{1,3}\.)", re.IGNORECASE)
        self.single_letter_abbreviation = re.compile(r"(?<![\w.])\p{L}\.(?!\p{L}{1,3}\.)")
        # abbreviations with multiple dots that constitute tokens
        single_token_abbreviation_list = utils.read_abbreviation_file("single_token_abbreviations_%s.txt" % self.language)
        self.single_token_abbreviation = re.compile(r"(?<![\w.])(?:" + r'|'.join(re.escape(abbr) for abbr in single_token_abbreviation_list) + r')(?!\p{L}{1,3}\.)', re.IGNORECASE)
        self.ps = re.compile(r"(?<!\d[ ])\bps\.", re.IGNORECASE)
        self.multipart_abbreviation = re.compile(r'(?:\p{L}+\.){2,}')
        # only abbreviations that are not matched by (?:\p{L}\.)+
        abbreviation_list = utils.read_abbreviation_file("abbreviations_%s.txt" % self.language)
        # abbrev_simple = [(a, re.search(r"^\p{L}{2,}\.$", a)) for a in abbreviation_list]
        # self.simple_abbreviations = set([a[0].lower() for a in abbrev_simple if a[1]])
        # self.simple_abbreviation_candidates = re.compile(r"(?<![\w.])\p{L}{2,}\.(?!\p{L}{1,3}\.)")
        # abbreviation_list = [a[0] for a in abbrev_simple if not a[1]]
        self.abbreviation = re.compile(r"(?<![\p{L}.])(?:" +
                                       r"(?:(?:\p{L}\.){2,})" +
                                       r"|" +
                                       # r"(?i:" +    # this part should be case insensitive
                                       r'|'.join(re.escape(abbr) for abbr in abbreviation_list) +
                                       # r"))+(?!\p{L}{1,3}\.)", re.V1)
                                       r")+(?!\p{L}{1,3}\.)", re.IGNORECASE)

        # MENTIONS, HASHTAGS, ACTION WORDS, UNDERLINE
        self.mention = re.compile(r'[@]\w+(?!\w)')
        self.hashtag = re.compile(r'(?<!\w)[#]\w+(?!\w)')
        self.action_word = re.compile(r'(?<!\w)(?P<a_open>[*+])(?P<b_middle>[^\s*]+)(?P<c_close>[*])(?!\w)')
        # a pair of underscores can be used to "underline" some text
        self.underline = re.compile(r"(?<!\w)(_)(\w[^_]+\w)(_)(?!\w)")

        # DATE, TIME, NUMBERS
        self.three_part_date_year_first = re.compile(r'(?<![\d.]) (?P<a_year>\d{4}) (?P<b_month_or_day>([/-])\d{1,2}) (?P<c_day_or_month>\3\d{1,2}) (?![\d.])', re.VERBOSE)
        self.three_part_date_dmy = re.compile(r'(?<![\d.]) (?P<a_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P<b_month>(?:0?[1-9]|1[0-2])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
        self.three_part_date_mdy = re.compile(r'(?<![\d.]) (?P<a_month>(?:0?[1-9]|1[0-2])([./-])) (?P<b_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
        self.two_part_date = re.compile(r'(?<![\d.]) (?P<a_day_or_month>\d{1,2}([./-])) (?P<b_day_or_month>\d{1,2}\2) (?![\d.])', re.VERBOSE)
        self.time = re.compile(r'(?<!\w)\d{1,2}(?:(?::\d{2}){1,2}){1,2}(?![\d:])')
        self.en_time = re.compile(r'(?<![\w])(?P<a_time>\d{1,2}(?:(?:[.:]\d{2})){0,2}) ?(?P<b_am_pm>(?:[ap]m\b|[ap]\.m\.(?!\w)))', re.IGNORECASE)
        self.en_us_phone_number = re.compile(r"(?<![\d-])(?:[2-9]\d{2}[/-])?\d{3}-\d{4}(?![\d-])")
        self.en_numerical_identifiers = re.compile(r"(?<![\d-])\d+-(?:\d+-)+\d+(?![\d-])|(?<![\d/])\d+/(?:\d+/)+\d+(?![\d/])")
        self.en_us_zip_code = re.compile(r"(?<![\d-])\d{5}-\d{4}(?![\d-])")
        self.ordinal = re.compile(r'(?<![\w.])(?:\d{1,3}|\d{5,}|[3-9]\d{3})\.(?!\d)')
        self.english_ordinal = re.compile(r'\b(?:\d+(?:,\d+)*)?(?:1st|2nd|3rd|\dth)\b')
        self.english_decades = re.compile(r"\b(?:[12]\d)?\d0['’]?s\b")
        self.fraction = re.compile(r'(?<!\w)\d+/\d+(?![\d/])')
        self.amount = re.compile(r'(?<!\w)(?:\d+[\d,.]*-)(?!\w)')
        self.semester = re.compile(r'(?<!\w)(?P<a_semester>[WS]S|SoSe|WiSe)(?P<b_jahr>\d\d(?:/\d\d)?)(?!\w)', re.IGNORECASE)
        self.measurement = re.compile(r'(?<!\w)(?P<a_amount>[−+-]?\d*[,.]?\d+) ?(?P<b_unit>(?:mm|cm|dm|m|km)(?:\^?[23])?|bit|cent|eur|f|ft|g|ghz|h|hz|kg|l|lb|min|ml|qm|s|sek)(?!\w)', re.IGNORECASE)
        # also Web2.0
        self.number_compound = re.compile(r'(?<!\w) (?:\d+-?[\p{L}@][\p{L}@-]* | [\p{L}@][\p{L}@-]*-?\d+(?:\.\d)?) (?!\w)', re.VERBOSE)
        self.number = re.compile(r"""(?<!\w|\d[.,]?)
                                     (?:[−+-]?              # optional sign
                                       (?:\d*               # optional digits before decimal point
                                       [.,])?               # optional decimal point
                                       \d+                  # digits
                                       (?:[eE][−+-]?\d+)?   # optional exponent
                                       |
                                       \d{1,3}(?:[.]\d{3})+(?:,\d+)?  # dot for thousands, comma for decimals: 1.999,95
                                       |
                                       \d{1,3}(?:,\d{3})+(?:[.]\d+)?  # comma for thousands, dot for decimals: 1,999.95
                                       )
                                     (?![.,]?\d)""", re.VERBOSE)
        self.ipv4 = re.compile(r"(?<!\w|\d[.,]?)(?:\d{1,3}[.]){3}\d{1,3}(?![.,]?\d)")
        self.section_number = re.compile(r"(?<!\w|\d[.,]?)(?:\d+[.])+\d+[.]?(?![.,]?\d)")

        # PUNCTUATION
        self.quest_exclam = re.compile(r"([!?]+)")
        # arrows
        self.space_right_arrow = re.compile(r'(-+)\s+(>)')
        self.space_left_arrow = re.compile(r'(<)\s+(-+)')
        self.arrow = re.compile(r'(-+>|<-+|[\u2190-\u21ff])')
        # parens
        self.paired_paren = re.compile(r'([(])(?!inn)([^()]*)([)])')
        self.paired_bracket = re.compile(r'(\[)([^][]*)(\])')
        self.paren = re.compile(r"""((?:(?<!\w)   # no alphanumeric character
                                       [[{(]      # opening paren
                                       (?=\w)) |  # alphanumeric character
                                     (?:(?<=\w)   # alphanumeric character
                                       []})]      # closing paren
                                       (?!\w)) |  # no alphanumeric character
                                     (?:(?<=\s)   # space
                                       []})]      # closing paren
                                       (?=\w)) |  # alphanumeric character
                                     (?:(?<=\w-)  # hyphen
                                       [)]        # closing paren
                                       (?=\w)))   # alphanumeric character
                                 """, re.VERBOSE)
        self.all_paren = re.compile(r"(?<=\s)[][(){}](?=\s)")
        self.de_slash = re.compile(r'(/+)(?!in(?:nen)?|en)')
        # English possessive and contracted forms
        self.en_trailing_apos = re.compile(r"(?<!..in|')(['’])(?!\w)")
        self.en_dms = re.compile(r"(?<=\w)(['’][dms])\b", re.IGNORECASE)
        self.en_llreve = re.compile(r"(?<=\w)(['’](?:ll|re|ve))\b", re.IGNORECASE)
        self.en_not = re.compile(r"(?<=\w)(n['’]t)\b", re.IGNORECASE)
        en_twopart_contractions = [r"\b(a)(lot)\b", r"\b(gon)(na)\b", r"\b(got)(ta)\b", r"\b(lem)(me)\b",
                                   r"\b(out)(ta)\b", r"\b(wan)(na)\b", r"\b(c'm)(on)\b",
                                   r"\b(more)(['’]n)\b", r"\b(d['’])(ye)\b", r"(?<!\w)(['’]t)(is)\b",
                                   r"(?<!\w)(['’]t)(was)\b", r"\b(there)(s)\b", r"\b(i)(m)\b",
                                   r"\b(you)(re)\b", r"\b(he)(s)\b", r"\b(she)(s)\b",
                                   r"\b(ai)(nt)\b", r"\b(are)(nt)\b", r"\b(is)(nt)\b",
                                   r"\b(do)(nt)\b", r"\b(does)(nt)\b", r"\b(did)(nt)\b",
                                   r"\b(i)(ve)\b", r"\b(you)(ve)\b", r"\b(they)(ve)\b",
                                   r"\b(have)(nt)\b", r"\b(has)(nt)\b", r"\b(can)(not)\b",
                                   r"\b(ca)(nt)\b", r"\b(could)(nt)\b", r"\b(wo)(nt)\b",
                                   r"\b(would)(nt)\b", r"\b(you)(ll)\b", r"\b(let)(s)\b"]
        en_threepart_contractions = [r"\b(du)(n)(no)\b", r"\b(wha)(dd)(ya)\b", r"\b(wha)(t)(cha)\b", r"\b(i)('m)(a)\b"]
        # w/o, w/out, b/c, b/t, l/c, w/, d/c, u/s
        self.en_slash_words = re.compile(r"\b(?:w/o|w/out|b/t|l/c|b/c|d/c|u/s)\b|\bw/(?!\w)", re.IGNORECASE)
        # word--word
        self.en_double_hyphen = re.compile(r"(?<=\w)--+(?=\w)")
        self.en_twopart_contractions = [re.compile(contr, re.IGNORECASE) for contr in en_twopart_contractions]
        self.en_threepart_contractions = [re.compile(contr, re.IGNORECASE) for contr in en_threepart_contractions]
        # English hyphenated words; these three attributes are only
        # set when the language is "en"
        if self.language == "en":
            nonbreaking_prefixes = utils.read_abbreviation_file("non-breaking_prefixes_%s.txt" % self.language)
            nonbreaking_suffixes = utils.read_abbreviation_file("non-breaking_suffixes_%s.txt" % self.language)
            nonbreaking_words = utils.read_abbreviation_file("non-breaking_hyphenated_words_%s.txt" % self.language)
            self.en_nonbreaking_prefixes = re.compile(r"(?<![\w-])(?:" + r'|'.join(re.escape(prefix) for prefix in nonbreaking_prefixes) + r")-[\w-]+", re.IGNORECASE)
            self.en_nonbreaking_suffixes = re.compile(r"\b[\w-]+-(?:" + r'|'.join(re.escape(suffix) for suffix in nonbreaking_suffixes) + r")(?![\w-])", re.IGNORECASE)
            self.en_nonbreaking_words = re.compile(r"\b(?:" + r'|'.join(re.escape(word) for word in nonbreaking_words) + r")\b", re.IGNORECASE)
        self.hyphen = re.compile(r"(?<=\w)(-)(?=\w)")
        self.en_no = re.compile(r"\b(no\.)\s*(?=\d)", re.IGNORECASE)
        self.en_degree = re.compile(r"(?<=\d ?)°(?:F|C|Oe)\b", re.IGNORECASE)
        # quotation marks
        # L'Enfer, d'accord, O'Connor
        self.letter_apostrophe_word = re.compile(r"\b([dlo]['’]\p{L}+)\b", re.IGNORECASE)
        self.paired_double_latex_quote = re.compile(r"(?<!`)(``)([^`']+)('')(?!')")
        self.paired_single_latex_quote = re.compile(r"(?<!`)(`)([^`']+)(')(?!')")
        self.paired_single_quot_mark = re.compile(r"(['‚‘’])([^']+)(['‘’])")
        self.all_quote = re.compile(r"(?<=\s)(?:``|''|`|['‚‘’])(?=\s)")
        self.other_punctuation = re.compile(r'([#<>%‰€$£₤¥°@~*„“”‚‘"»«›‹,;:+×÷±≤≥=&–—])')
        self.en_quotation_marks = re.compile(r'([„“”‚‘’"»«›‹])')
        self.en_other_punctuation = re.compile(r'([#<>%‰€$£₤¥°@~*,;:+×÷±≤≥=&/–—-]+)')
        self.ellipsis = re.compile(r'\.{2,}|…+(?:\.{2,})?')
        self.dot_without_space = re.compile(r'(?<=\p{Ll}{2})(\.)(?=\p{Lu}\p{Ll}{2})')
        # self.dot = re.compile(r'(?<=[\w)])(\.)(?![\w])')
        self.dot = re.compile(r'(\.)')
Esempio n. 3
0
    def __init__(self,
                 split_camel_case=False,
                 token_classes=False,
                 extra_info=False):
        """Create a Tokenizer object. If split_camel_case is set to True,
        tokens written in CamelCase will be split. If token_classes is
        set to true, the tokenizer will output the token class for
        each token (if it is a number, an XML tag, an abbreviation,
        etc.). If extra_info is set to True, the tokenizer will output
        information about the original spelling of the tokens.

        """
        self.split_camel_case = split_camel_case
        self.token_classes = token_classes
        self.extra_info = extra_info
        self.unique_string_length = 7
        self.mapping = {}
        self.unique_prefix = None
        self.replacement_counter = 0

        self.spaces = re.compile(r"\s+")
        self.controls = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
        # soft hyphen (00AD), zero-width space (200B)
        self.other_nasties = re.compile(r"[\u00AD\u200B]")
        # combination
        self.starts_with_junk = re.compile(
            r"^[\u0000-\u001F\u007F-\u009F\u00AD\u200B]+")
        self.junk_between_spaces = re.compile(
            r"(?:^|\s+)[\s\u0000-\u001F\u007F-\u009F\u00AD\u200B]+(?:\s+|$)")

        # TAGS, EMAILS, URLs
        # self.tag = re.compile(r'<(?!-)(?:/[^> ]+|[^>]+/?)(?<!-)>')
        # taken from Regular Expressions Cookbook
        self.tag = re.compile(
            r"""
                                  <
                                  (?:                  # Branch for opening tags:
                                    ([_:A-Z][-.:\w]*)  #   Capture the opening tag name to backreference 1
                                    (?:                #   This group permits zero or more attributes
                                      \s+              #   Whitespace to separate attributes
                                      [_:A-Z][-.:\w]*  #   Attribute name
                                      \s*=\s*          #   Attribute name-value delimiter
                                      (?: "[^"”“]*"      #   Double-quoted attribute value
                                        | '[^']*'      #   Single-quoted attribute value
                                      )
                                    )*
                                    \s*                #   Permit trailing whitespace
                                    /?                 #   Permit self-closed tags
                                  |                    # Branch for closing tags:
                                    /
                                    ([_:A-Z][-.:\w]*)  #   Capture the closing tag name to backreference 2
                                    \s*                #   Permit trailing whitespace
                                  )
                                  >
        """, re.VERBOSE | re.IGNORECASE)
        # regex for email addresses taken from:
        # http://www.regular-expressions.info/email.html
        # self.email = re.compile(r"\b[[:alnum:].%+-]+@[[:alnum:].-]+\.[[:alpha:]]{2,}\b")
        self.email = re.compile(
            r"\b[[:alnum:].%+-]+(?:@| \[?at\]? )[[:alnum:].-]+(?:\.| \[?dot\]? )[[:alpha:]]{2,}\b"
        )
        # simple regex for urls that start with http or www
        # TODO: schließende Klammer am Ende erlauben, wenn nach http etc. eine öffnende kam
        self.simple_url_with_brackets = re.compile(
            r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+?\(\S*?\)\S*(?=$|[\'. "!?,;\n\t])',
            re.IGNORECASE)
        self.simple_url = re.compile(
            r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+[^\'. "!?,;:\n\t]',
            re.IGNORECASE)
        self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
        self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+',
                                         re.IGNORECASE)
        # we also allow things like tagesschau.de-App
        self.url_without_protocol = re.compile(
            r'\b[\w./-]+\.(?:de|com|org|net|edu|info|jpg|png|gif|log|txt)(?:-\w+)?\b',
            re.IGNORECASE)

        # XML entities
        self.entity_name = re.compile(r'&(?:quot|amp|apos|lt|gt);',
                                      re.IGNORECASE)
        self.entity_decimal = re.compile(r'&#\d+;')
        self.entity_hex = re.compile(r'&#x[0-9a-f]+;', re.IGNORECASE)

        # EMOTICONS
        # TODO: Peter, SMS von gestern Nacht -> hauptsächlich entities -> hilft nicht so wahnsinnig.
        emoticon_set = set([
            "(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:", "(-:", ")=", ")o:", ")x",
            ":'C", ":/", ":<", ":C", ":[", "=(", "=)", "=D", "=P", ">:", "D':",
            "D:", "\:", "]:", "x(", "^^", "o.O", "oO", "\O/", "\m/", ":;))",
            "_))", "*_*", "._.", ":wink:", ">_<", "*<:-)", ":!:", ":;-))"
        ])
        emoticon_list = sorted(emoticon_set, key=len, reverse=True)
        self.emoticon = re.compile(
            r"""(?:(?:[:;]|(?<!\d)8)           # a variety of eyes, alt.: [:;8]
                                        [-'oO]?                       # optional nose or tear
                                        (?: \)+ | \(+ | [*] | ([DPp])\1*(?!\w)))   # a variety of mouths
                                    """ + r"|" + r"(?:xD+|XD+)" + r"|" +
            r"|".join([re.escape(_) for _ in emoticon_list]), re.VERBOSE)
        self.space_emoticon = re.compile(r'([:;])[ ]+([()])')
        # ^3 is an emoticon, unless it is preceded by a number (with
        # optional whitespace between number and ^3)
        # ^\^3    # beginning of line, no leading characters
        # ^\D^3   # beginning of line, one leading character
        # (?<=\D[ ])^3   # two leading characters, non-number + space
        # (?<=.[^\d ])^3   # two leading characters, x + non-space-non-number
        self.heart_emoticon = re.compile(
            r"(?:^|^\D|(?<=\D[ ])|(?<=.[^\d ]))\^3")
        # U+2600..U+26FF	Miscellaneous Symbols
        # U+2700..U+27BF	Dingbats
        # U+1F300..U+1F5FF	Miscellaneous Symbols and Pictographs
        # U+1F600..U+1F64F	Emoticons
        # U+1F680..U+1F6FF	Transport and Map Symbols
        # U+1F900..U+1F9FF	Supplemental Symbols and Pictographs
        # self.unicode_symbols = re.compile(r"[\u2600-\u27BF]|[\u1F300-\u1F64F]|[\u1F680-\u1F6FF]|[\u1F900-\u1F9FF]")
        self.unicode_symbols = re.compile(
            r"[\u2600-\u27BF\U0001F300-\U0001f64f\U0001F680-\U0001F6FF\U0001F900-\U0001F9FF]"
        )

        # special tokens containing + or &
        tokens_with_plus_or_ampersand = utils.read_abbreviation_file(
            "tokens_with_plus_or_ampersand.txt")
        plus_amp_simple = [(pa, re.search(r"^\w+[&+]\w+$", pa))
                           for pa in tokens_with_plus_or_ampersand]
        self.simple_plus_ampersand = set(
            [pa[0].lower() for pa in plus_amp_simple if pa[1]])
        self.simple_plus_ampersand_candidates = re.compile(r"\b\w+[&+]\w+\b")
        tokens_with_plus_or_ampersand = [
            pa[0] for pa in plus_amp_simple if not pa[1]
        ]
        # self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:\L<patokens>)(?!\w)", re.IGNORECASE, patokens=tokens_with_plus_or_ampersand)
        self.token_with_plus_ampersand = re.compile(
            r"(?<!\w)(?:" +
            r"|".join([re.escape(_)
                       for _ in tokens_with_plus_or_ampersand]) + r")(?!\w)",
            re.IGNORECASE)

        # camelCase
        self.emoji = re.compile(r'\bemojiQ[[:alpha:]]{3,}\b')
        camel_case_token_list = utils.read_abbreviation_file(
            "camel_case_tokens.txt")
        cc_alnum = [(cc, re.search(r"^\w+$", cc))
                    for cc in camel_case_token_list]
        self.simple_camel_case_tokens = set(
            [cc[0] for cc in cc_alnum if cc[1]])
        self.simple_camel_case_candidates = re.compile(
            r"\b\w*[[:lower:]][[:upper:]]\w*\b")
        camel_case_token_list = [cc[0] for cc in cc_alnum if not cc[1]]
        # things like ImmobilienScout24.de are already covered by URL detection
        # self.camel_case_url = re.compile(r'\b(?:[[:upper:]][[:lower:][:digit:]]+){2,}\.(?:de|com|org|net|edu)\b')
        self.camel_case_token = re.compile(
            r"\b(?:" + r"|".join([re.escape(_)
                                  for _ in camel_case_token_list]) +
            r"|:Mac[[:upper:]][[:lower:]]*)\b")
        # self.camel_case_token = re.compile(r"\b(?:\L<cctokens>|Mac[[:upper:]][[:lower:]]*)\b", cctokens=camel_case_token_set)
        self.in_and_innen = re.compile(
            r'\b[[:alpha:]]+[[:lower:]]In(?:nen)?[[:lower:]]*\b')
        self.camel_case = re.compile(
            r'(?<=[[:lower:]]{2})([[:upper:]])(?![[:upper:]]|\b)')

        # ABBREVIATIONS
        self.single_letter_ellipsis = re.compile(
            r"(?<![\w.])(?P<a_letter>[[:alpha:]])(?P<b_ellipsis>\.{3})(?!\.)")
        self.and_cetera = re.compile(r"(?<![\w.&])&c\.(?![[:alpha:]]{1,3}\.)")
        self.str_abbreviations = re.compile(
            r'(?<![\w.])([[:alpha:]-]+-Str\.)(?![[:alpha:]])', re.IGNORECASE)
        self.nr_abbreviations = re.compile(
            r"(?<![\w.])(\w+\.-?Nr\.)(?![[:alpha:]]{1,3}\.)", re.IGNORECASE)
        self.single_letter_abbreviation = re.compile(
            r"(?<![\w.])[[:alpha:]]\.(?![[:alpha:]]{1,3}\.)")
        # abbreviations with multiple dots that constitute tokens
        single_token_abbreviation_list = utils.read_abbreviation_file(
            "single_token_abbreviations.txt")
        self.single_token_abbreviation = re.compile(
            r"(?<![\w.])(?:" +
            r'|'.join([re.escape(_) for _ in single_token_abbreviation_list]) +
            r')(?![[:alpha:]]{1,3}\.)')
        self.ps = re.compile(r"(?<!\d[ ])\bps\.", re.IGNORECASE)
        self.multipart_abbreviation = re.compile(r'(?:[[:alpha:]]+\.){2,}')
        # only abbreviations that are not matched by (?:[[:alpha:]]\.)+
        abbreviation_list = utils.read_abbreviation_file("abbreviations.txt")
        # abbrev_simple = [(a, re.search(r"^[[:alpha:]]{2,}\.$", a)) for a in abbreviation_list]
        # self.simple_abbreviations = set([a[0].lower() for a in abbrev_simple if a[1]])
        # self.simple_abbreviation_candidates = re.compile(r"(?<![\w.])[[:alpha:]]{2,}\.(?![[:alpha:]]{1,3}\.)")
        # abbreviation_list = [a[0] for a in abbrev_simple if not a[1]]
        self.abbreviation = re.compile(
            r"(?<![\w.])(?:" + r"(?:(?:[[:alpha:]]\.){2,})" + r"|" +
            # r"(?i:" +    # this part should be case insensitive
            r'|'.join([re.escape(_) for _ in abbreviation_list]) +
            # r"))+(?![[:alpha:]]{1,3}\.)", re.V1)
            r")+(?![[:alpha:]]{1,3}\.)",
            re.IGNORECASE)

        # MENTIONS, HASHTAGS, ACTION WORDS
        self.mention = re.compile(r'[@]\w+(?!\w)')
        self.hashtag = re.compile(r'(?<!\w)[#]\w+(?!\w)')
        # action words without spaces are to be treated as units
        self.action_word = re.compile(
            r'(?<!\w)(?P<a_open>[*+])(?P<b_middle>[^\s*]+)(?P<c_close>[*])(?!\w)'
        )

        # DATE, TIME, NUMBERS
        self.three_part_date_year_first = re.compile(
            r'(?<![\d.]) (?P<a_year>\d{4}) (?P<b_month_or_day>([/-])\d{1,2}) (?P<c_day_or_month>\3\d{1,2}) (?![\d.])',
            re.VERBOSE)
        self.three_part_date_dmy = re.compile(
            r'(?<![\d.]) (?P<a_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P<b_month>(?:0?[1-9]|1[0-2])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])',
            re.VERBOSE)
        self.three_part_date_mdy = re.compile(
            r'(?<![\d.]) (?P<a_month>(?:0?[1-9]|1[0-2])([./-])) (?P<b_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])',
            re.VERBOSE)
        self.two_part_date = re.compile(
            r'(?<![\d.]) (?P<a_day_or_month>\d{1,2}([./-])) (?P<b_day_or_month>\d{1,2}\2) (?![\d.])',
            re.VERBOSE)
        self.time = re.compile(r'(?<!\w)\d{1,2}(?::\d{2}){1,2}(?![\d:])')
        self.ordinal = re.compile(
            r'(?<![\w.])(?:\d{1,3}|\d{5,}|[3-9]\d{3})\.(?!\d)')
        self.fraction = re.compile(r'(?<!\w)\d+/\d+(?![\d/])')
        self.amount = re.compile(r'(?<!\w)(?:\d+[\d,.]*-)(?!\w)')
        self.semester = re.compile(
            r'(?<!\w)(?P<a_semester>[WS]S|SoSe|WiSe)(?P<b_jahr>\d\d(?:/\d\d)?)(?!\w)',
            re.IGNORECASE)
        self.measurement = re.compile(
            r'(?<!\w)(?P<a_amount>[−+-]?\d*[,.]?\d+)(?P<b_unit>(?:mm|cm|dm|m|km)(?:\^?[23])?|qm|g|kg|min|h|s|sek|cent|eur)(?!\w)',
            re.IGNORECASE)
        # also covers tokens such as Web2.0
        self.number_compound = re.compile(
            r'(?<!\w) (?:\d+-?[[:alpha:]@]+ | [[:alpha:]@]+-?\d+(?:\.\d)?) (?!\w)',
            re.VERBOSE)
        self.number = re.compile(
            r"""(?<!\w)
                                     (?:[−+-]?              # optional sign
                                       \d*                  # optional digits before decimal point
                                       [.,]?                # optional decimal point
                                       \d+                  # digits
                                       (?:[eE][−+-]?\d+)?   # optional exponent
                                       |
                                       \d+[\d.,]*\d+)
                                     (?![.,]?\d)""", re.VERBOSE)

        # PUNCTUATION
        self.quest_exclam = re.compile(r"([!?]+)")
        # arrows
        self.space_right_arrow = re.compile(r'(-+)\s+(>)')
        self.space_left_arrow = re.compile(r'(<)\s+(-+)')
        self.arrow = re.compile(r'(-+>|<-+|[\u2190-\u21ff])')
        # parens
        self.paired_paren = re.compile(r'([(])(?!inn)([^()]*)([)])')
        self.paired_bracket = re.compile(r'(\[)([^][]*)(\])')
        self.paren = re.compile(
            r"""((?:(?<!\w)   # no alphanumeric character
                                       [[{(]      # opening paren
                                       (?=\w)) |  # alphanumeric character
                                     (?:(?<=\w)   # alphanumeric character
                                       []})]      # closing paren
                                       (?!\w)) |  # no alphanumeric character
                                     (?:(?<=\s)   # space
                                       []})]      # closing paren
                                       (?=\w)) |  # alphanumeric character
                                     (?:(?<=\w-)  # hyphen
                                       [)]        # closing paren
                                       (?=\w)))   # alphanumeric character
                                 """, re.VERBOSE)
        self.all_paren = re.compile(r"(?<=\s)[][(){}](?=\s)")
        self.slash = re.compile(r'(/+)(?!in(?:nen)?|en)')
        self.paired_double_latex_quote = re.compile(
            r"(?<!`)(``)([^`']+)('')(?!')")
        self.paired_single_latex_quote = re.compile(
            r"(?<!`)(`)([^`']+)(')(?!')")
        self.paired_single_quot_mark = re.compile(r"(['‚‘’])([^']+)(['‘’])")
        self.all_quote = re.compile(r"(?<=\s)(?:``|''|`|['‚‘’])(?=\s)")
        self.other_punctuation = re.compile(
            r'([<>%‰€$£₤¥°@~*„“”‚‘"»«›‹,;:+=&–])')
        self.ellipsis = re.compile(r'\.{2,}|…+(?:\.{2,})?')
        self.dot_without_space = re.compile(
            r'(?<=[[:lower:]]{2})(\.)(?=[[:upper:]][[:lower:]]{2})')
        # self.dot = re.compile(r'(?<=[\w)])(\.)(?![\w])')
        self.dot = re.compile(r'(\.)')