def __init__(self, is_tuple=False):
    """Create a SentenceSplitter object. If the tokenized paragraphs
    contain token classes or extra info, set is_tuple=True.

    """
    self.is_tuple = is_tuple
    # full stop, ellipsis, exclamation and question marks
    self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    self.closing_punct = re.compile(r"^(?:['\"“\p{Pf}\p{Pe}])$")
    self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
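
# A minimal, self-contained sketch of how the three punctuation classes
# above interact. It assumes the third-party `regex` package (needed for
# the \p{...} properties used throughout this module) is available; the
# helper name and the sample tokens are illustrative only.
def _demo_sentence_punct():
    import regex
    sentence_ending = regex.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    opening = regex.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    closing = regex.compile(r"^(?:['\"“\p{Pf}\p{Pe}])$")
    for token in [".", "…", "!?", "¿", "(", ")", "”", "--", "word"]:
        kinds = [name for name, pattern in (("sentence-ending", sentence_ending),
                                            ("opening", opening),
                                            ("closing", closing))
                 if pattern.match(token)]
        print(token, kinds or ["none"])
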
def __init__(self, split_camel_case=False, token_classes=False,
             extra_info=False, language="de"):
    """Create a Tokenizer object. If split_camel_case is set to True,
    tokens written in CamelCase will be split. If token_classes is set
    to True, the tokenizer will output the token class for each token
    (if it is a number, an XML tag, an abbreviation, etc.). If
    extra_info is set to True, the tokenizer will output information
    about the original spelling of the tokens.

    """
    self.split_camel_case = split_camel_case
    self.token_classes = token_classes
    self.extra_info = extra_info
    self.language = language if language in self.supported_languages else self.default_language
    self.unique_string_length = 7
    self.mapping = {}
    self.unique_prefix = None
    self.replacement_counter = 0

    self.spaces = re.compile(r"\s+")
    self.controls = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
    self.stranded_variation_selector = re.compile(r" \uFE0F")
    # soft hyphen (00AD), zero-width space (200B), zero-width
    # non-joiner (200C), zero-width joiner (200D), Arabic letter
    # mark (061C), left-to-right mark (200E), right-to-left mark
    # (200F), word joiner (2060), left-to-right isolate (2066),
    # right-to-left isolate (2067), first strong isolate (2068),
    # pop directional isolate (2069), l-t-r/r-t-l embedding (202A,
    # 202B), l-t-r/r-t-l override (202D, 202E), pop directional
    # formatting (202C), zero-width no-break space (FEFF)
    self.other_nasties = re.compile(r"[\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]")
    # combination
    self.starts_with_junk = re.compile(r"^[\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]+")
    self.junk_next_to_space = re.compile(r"(?:^|\s)[\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]+|[\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]+(?:\s|$)")
    self.junk_between_spaces = re.compile(r"(?:^|\s+)[\s\u0000-\u001F\u007F-\u009F\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]+(?:\s+|$)")

    # My Additions
    self.letter_hyphen = re.compile(r'\b\p{Lu}-\p{L}{3,}\b')

    # TAGS, EMAILS, URLs
    self.xml_declaration = re.compile(r"""<\?xml
                                          (?:                # This group permits zero or more attributes
                                            \s+              # Whitespace to separate attributes
                                            [_:A-Z][-.:\w]*  # Attribute name
                                            \s*=\s*          # Attribute name-value delimiter
                                            (?: "[^"]*"      # Double-quoted attribute value
                                              | '[^']*'      # Single-quoted attribute value
                                            )
                                          )*
                                          \s*                # Permit trailing whitespace
                                          \?>""", re.VERBOSE | re.IGNORECASE)
    # self.tag = re.compile(r'<(?!-)(?:/[^> ]+|[^>]+/?)(?<!-)>')
    # taken from Regular Expressions Cookbook
    self.tag = re.compile(r"""
        <
        (?:                  # Branch for opening tags:
          ([_:A-Z][-.:\w]*)  # Capture the opening tag name to backreference 1
          (?:                # This group permits zero or more attributes
            \s+              # Whitespace to separate attributes
            [_:A-Z][-.:\w]*  # Attribute name
            \s*=\s*          # Attribute name-value delimiter
            (?: "[^"]*"      # Double-quoted attribute value
              | '[^']*'      # Single-quoted attribute value
            )
          )*
          \s*                # Permit trailing whitespace
          /?                 # Permit self-closed tags
        |                    # Branch for closing tags:
          /
          ([_:A-Z][-.:\w]*)  # Capture the closing tag name to backreference 2
          \s*                # Permit trailing whitespace
        )
        >
        """, re.VERBOSE | re.IGNORECASE)
    # regex for email addresses taken from:
    # http://www.regular-expressions.info/email.html
    # self.email = re.compile(r"\b[\w.%+-]+@[\w.-]+\.\p{L}{2,}\b")
    self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
    # simple regex for urls that start with http or www
    # TODO: allow a closing bracket at the end if an opening one appeared after http etc.
    self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+?\(\S*?\)\S*(?=$|[\'. "!?,;\n\t])', re.IGNORECASE)
    self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+[^\'. "!?,;:\n\t)]', re.IGNORECASE)
    self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
    self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
    # we also allow things like tagesschau.de-App
    self.url_without_protocol = re.compile(r'\b[\w./-]+\.(?:de|com|org|net|edu|info|gov|jpg|png|gif|log|txt|xlsx?|docx?|pptx?|pdf)(?:-\w+)?\b', re.IGNORECASE)
    self.reddit_links = re.compile(r'(?<!\w)/?[rlu](?:/\w+)+/?(?!\w)', re.IGNORECASE)

    # XML entities
    self.entity_name = re.compile(r'&(?:quot|amp|apos|lt|gt);', re.IGNORECASE)
    self.entity_decimal = re.compile(r'&#\d+;')
    self.entity_hex = re.compile(r'&#x[0-9a-f]+;', re.IGNORECASE)

    # EMOTICONS
    emoticon_set = set(["(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:", "(-:", ")=",
                        ")o:", ")x", ":'C", ":/", ":<", ":C", ":[", "=(", "=)",
                        "=D", "=P", ">:", "\\:", "]:", "x(", "^^", "o.O",
                        "\\O/", "\\m/", ":;))", "_))", "*_*", "._.", ":wink:",
                        ">_<", "*<:-)", ":!:", ":;-))"])
    emoticon_list = sorted(emoticon_set, key=len, reverse=True)
    self.emoticon = re.compile(r"""(?:(?:[:;]|(?<!\d)8)  # a variety of eyes, alt.: [:;8]
                                    [-'oO]?              # optional nose or tear
                                    (?: \)+ | \(+ | [*] | ([DPp])\1*(?!\w)))  # a variety of mouths
                               """ +
                               r"|" +
                               r"(?:\b[Xx]D+\b)" +
                               r"|" +
                               r"(?:\b(?:D'?:|oO)\b)" +
                               r"|" +
                               r"|".join([re.escape(_) for _ in emoticon_list]),
                               re.VERBOSE)
    self.space_emoticon = re.compile(r'([:;])[ ]+([()])(?! *[\+0])')
    # ^3 is an emoticon, unless it is preceded by a number (with
    # optional whitespace between number and ^3)
    # ^\^3           # beginning of line, no leading characters
    # ^\D^3          # beginning of line, one leading character
    # (?<=\D[ ])^3   # two leading characters, non-number + space
    # (?<=.[^\d ])^3 # two leading characters, x + non-space-non-number
    self.heart_emoticon = re.compile(r"(?:^|^\D|(?<=\D[ ])|(?<=.[^\d ]))\^3")
    # U+2600..U+26FF Miscellaneous Symbols
    # U+2700..U+27BF Dingbats
    # U+FE0E..U+FE0F text and emoji variation selectors
    # U+1F300..U+1F5FF Miscellaneous Symbols and Pictographs
    # -> U+1F3FB..U+1F3FF Emoji modifiers (skin tones)
    # U+1F600..U+1F64F Emoticons
    # U+1F680..U+1F6FF Transport and Map Symbols
    # U+1F900..U+1F9FF Supplemental Symbols and Pictographs
    # self.unicode_symbols = re.compile(r"[\u2600-\u27BF\uFE0E\uFE0F\U0001F300-\U0001f64f\U0001F680-\U0001F6FF\U0001F900-\U0001F9FF]")
    self.unicode_flags = re.compile(r"\p{Regional_Indicator}{2}\uFE0F?")

    # special tokens containing + or &
    tokens_with_plus_or_ampersand = utils.read_abbreviation_file("tokens_with_plus_or_ampersand.txt")
    plus_amp_simple = [(pa, re.search(r"^\w+[&+]\w+$", pa)) for pa in tokens_with_plus_or_ampersand]
    self.simple_plus_ampersand = set([pa[0].lower() for pa in plus_amp_simple if pa[1]])
    self.simple_plus_ampersand_candidates = re.compile(r"\b\w+[&+]\w+\b")
    tokens_with_plus_or_ampersand = [pa[0] for pa in plus_amp_simple if not pa[1]]
    # self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:\L<patokens>)(?!\w)", re.IGNORECASE, patokens=tokens_with_plus_or_ampersand)
    self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:" + r"|".join([re.escape(_) for _ in tokens_with_plus_or_ampersand]) + r")(?!\w)", re.IGNORECASE)

    # camelCase
    self.emoji = re.compile(r'\bemojiQ\p{L}{3,}\b')
    camel_case_token_list = utils.read_abbreviation_file("camel_case_tokens.txt")
    cc_alnum = [(cc, re.search(r"^\w+$", cc)) for cc in camel_case_token_list]
    self.simple_camel_case_tokens = set([cc[0] for cc in cc_alnum if cc[1]])
    self.simple_camel_case_candidates = re.compile(r"\b\w*\p{Ll}\p{Lu}\w*\b")
    camel_case_token_list = [cc[0] for cc in cc_alnum if not cc[1]]
    # things like ImmobilienScout24.de are already covered by URL detection
    # self.camel_case_url = re.compile(r'\b(?:\p{Lu}[\p{Ll}\d]+){2,}\.(?:de|com|org|net|edu)\b')
    self.camel_case_token = re.compile(r"\b(?:" + r"|".join([re.escape(_) for _ in camel_case_token_list]) + r"|:Mac\p{Lu}\p{Ll}*)\b")
    # self.camel_case_token = re.compile(r"\b(?:\L<cctokens>|Mac\p{Lu}\p{Ll}*)\b", cctokens=camel_case_token_set)
    self.in_and_innen = re.compile(r'\b\p{L}+\p{Ll}In(?:nen)?\p{Ll}*\b')
    self.camel_case = re.compile(r'(?<=\p{Ll}{2})(\p{Lu})(?!\p{Lu}|\b)')

    # GENDER STAR
    self.gender_star = re.compile(r'\b\p{L}+\*in(?:nen)?\p{Ll}*\b', re.IGNORECASE)

    # ABBREVIATIONS
    self.single_letter_ellipsis = re.compile(r"(?<![\w.])(?P<a_letter>\p{L})(?P<b_ellipsis>\.{3})(?!\.)")
    self.and_cetera = re.compile(r"(?<![\w.&])&c\.(?!\p{L}{1,3}\.)")
    self.str_abbreviations = re.compile(r'(?<![\w.])([\p{L}-]+-Str\.)(?!\p{L})', re.IGNORECASE)
    self.nr_abbreviations = re.compile(r"(?<![\w.])(\w+\.-?Nr\.)(?!\p{L}{1,3}\.)", re.IGNORECASE)
    self.single_letter_abbreviation = re.compile(r"(?<![\w.])\p{L}\.(?!\p{L}{1,3}\.)")
    # abbreviations with multiple dots that constitute tokens
    single_token_abbreviation_list = utils.read_abbreviation_file("single_token_abbreviations_%s.txt" % self.language)
    self.single_token_abbreviation = re.compile(r"(?<![\w.])(?:" + r'|'.join([re.escape(_) for _ in single_token_abbreviation_list]) + r')(?!\p{L}{1,3}\.)', re.IGNORECASE)
    self.ps = re.compile(r"(?<!\d[ ])\bps\.", re.IGNORECASE)
    self.multipart_abbreviation = re.compile(r'(?:\p{L}+\.){2,}')
    # only abbreviations that are not matched by (?:\p{L}\.)+
    abbreviation_list = utils.read_abbreviation_file("abbreviations_%s.txt" % self.language)
    # abbrev_simple = [(a, re.search(r"^\p{L}{2,}\.$", a)) for a in abbreviation_list]
    # self.simple_abbreviations = set([a[0].lower() for a in abbrev_simple if a[1]])
    # self.simple_abbreviation_candidates = re.compile(r"(?<![\w.])\p{L}{2,}\.(?!\p{L}{1,3}\.)")
    # abbreviation_list = [a[0] for a in abbrev_simple if not a[1]]
    self.abbreviation = re.compile(r"(?<![\p{L}.])(?:" +
                                   r"(?:(?:\p{L}\.){2,})" +
                                   r"|" +
                                   # r"(?i:" +  # this part should be case insensitive
                                   r'|'.join([re.escape(_) for _ in abbreviation_list]) +
                                   # r"))+(?!\p{L}{1,3}\.)", re.V1)
                                   r")+(?!\p{L}{1,3}\.)", re.IGNORECASE)

    # MENTIONS, HASHTAGS, ACTION WORDS, UNDERLINE
    self.mention = re.compile(r'[@]\w+(?!\w)')
    self.hashtag = re.compile(r'(?<!\w)[#]\w+(?!\w)')
    self.action_word = re.compile(r'(?<!\w)(?P<a_open>[*+])(?P<b_middle>[^\s*]+)(?P<c_close>[*])(?!\w)')
    # a pair of underscores can be used to "underline" some text
    self.underline = re.compile(r"(?<!\w)(_)(\w[^_]+\w)(_)(?!\w)")

    # DATE, TIME, NUMBERS
    self.three_part_date_year_first = re.compile(r'(?<![\d.]) (?P<a_year>\d{4}) (?P<b_month_or_day>([/-])\d{1,2}) (?P<c_day_or_month>\3\d{1,2}) (?![\d.])', re.VERBOSE)
    self.three_part_date_dmy = re.compile(r'(?<![\d.]) (?P<a_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P<b_month>(?:0?[1-9]|1[0-2])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
    self.three_part_date_mdy = re.compile(r'(?<![\d.]) (?P<a_month>(?:0?[1-9]|1[0-2])([./-])) (?P<b_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
    self.two_part_date = re.compile(r'(?<![\d.]) (?P<a_day_or_month>\d{1,2}([./-])) (?P<b_day_or_month>\d{1,2}\2) (?![\d.])', re.VERBOSE)
    self.time = re.compile(r'(?<!\w)\d{1,2}(?:(?::\d{2}){1,2}){1,2}(?![\d:])')
    self.en_time = re.compile(r'(?<![\w])(?P<a_time>\d{1,2}(?:(?:[.:]\d{2})){0,2}) ?(?P<b_am_pm>(?:[ap]m\b|[ap]\.m\.(?!\w)))', re.IGNORECASE)
    self.en_us_phone_number = re.compile(r"(?<![\d-])(?:[2-9]\d{2}[/-])?\d{3}-\d{4}(?![\d-])")
    self.en_numerical_identifiers = re.compile(r"(?<![\d-])\d+-(?:\d+-)+\d+(?![\d-])|(?<![\d/])\d+/(?:\d+/)+\d+(?![\d/])")
    self.en_us_zip_code = re.compile(r"(?<![\d-])\d{5}-\d{4}(?![\d-])")
    self.ordinal = re.compile(r'(?<![\w.])(?:\d{1,3}|\d{5,}|[3-9]\d{3})\.(?!\d)')
    self.english_ordinal = re.compile(r'\b(?:\d+(?:,\d+)*)?(?:1st|2nd|3rd|\dth)\b')
    self.english_decades = re.compile(r"\b(?:[12]\d)?\d0['’]?s\b")
    self.fraction = re.compile(r'(?<!\w)\d+/\d+(?![\d/])')
    self.amount = re.compile(r'(?<!\w)(?:\d+[\d,.]*-)(?!\w)')
    self.semester = re.compile(r'(?<!\w)(?P<a_semester>[WS]S|SoSe|WiSe)(?P<b_jahr>\d\d(?:/\d\d)?)(?!\w)', re.IGNORECASE)
    self.measurement = re.compile(r'(?<!\w)(?P<a_amount>[−+-]?\d*[,.]?\d+) ?(?P<b_unit>(?:mm|cm|dm|m|km)(?:\^?[23])?|bit|cent|eur|f|ft|g|ghz|h|hz|kg|l|lb|min|ml|qm|s|sek)(?!\w)', re.IGNORECASE)
    # also covers things like Web2.0
    self.number_compound = re.compile(r'(?<!\w) (?:\d+-?[\p{L}@][\p{L}@-]* | [\p{L}@][\p{L}@-]*-?\d+(?:\.\d)?) (?!\w)', re.VERBOSE)
    self.number = re.compile(r"""(?<!\w|\d[.,]?)
                                 (?:[−+-]?             # optional sign
                                   (?:\d*              # optional digits before decimal point
                                   [.,])?              # optional decimal point
                                   \d+                 # digits
                                   (?:[eE][−+-]?\d+)?  # optional exponent
                                   |
                                   \d{1,3}(?:[.]\d{3})+(?:,\d+)?  # dot for thousands, comma for decimals: 1.999,95
                                   |
                                   \d{1,3}(?:,\d{3})+(?:[.]\d+)?  # comma for thousands, dot for decimals: 1,999.95
                                 )
                                 (?![.,]?\d)""", re.VERBOSE)
    self.ipv4 = re.compile(r"(?<!\w|\d[.,]?)(?:\d{1,3}[.]){3}\d{1,3}(?![.,]?\d)")
    self.section_number = re.compile(r"(?<!\w|\d[.,]?)(?:\d+[.])+\d+[.]?(?![.,]?\d)")

    # PUNCTUATION
    self.quest_exclam = re.compile(r"([!?]+)")
    # arrows
    self.space_right_arrow = re.compile(r'(-+)\s+(>)')
    self.space_left_arrow = re.compile(r'(<)\s+(-+)')
    self.arrow = re.compile(r'(-+>|<-+|[\u2190-\u21ff])')
    # parens
    self.paired_paren = re.compile(r'([(])(?!inn)([^()]*)([)])')
    self.paired_bracket = re.compile(r'(\[)([^][]*)(\])')
    self.paren = re.compile(r"""((?:(?<!\w)   # no alphanumeric character
                                  [[{(]       # opening paren
                                  (?=\w)) |   # alphanumeric character
                                 (?:(?<=\w)   # alphanumeric character
                                  []})]       # closing paren
                                  (?!\w)) |   # no alphanumeric character
                                 (?:(?<=\s)   # space
                                  []})]       # closing paren
                                  (?=\w)) |   # alphanumeric character
                                 (?:(?<=\w-)  # hyphen
                                  [)]         # closing paren
                                  (?=\w)))    # alphanumeric character
                             """, re.VERBOSE)
    self.all_paren = re.compile(r"(?<=\s)[][(){}](?=\s)")
    self.de_slash = re.compile(r'(/+)(?!in(?:nen)?|en)')
    # English possessive and contracted forms
    self.en_trailing_apos = re.compile(r"(?<!..in|')(['’])(?!\w)")
    self.en_dms = re.compile(r"(?<=\w)(['’][dms])\b", re.IGNORECASE)
    self.en_llreve = re.compile(r"(?<=\w)(['’](?:ll|re|ve))\b", re.IGNORECASE)
    self.en_not = re.compile(r"(?<=\w)(n['’]t)\b", re.IGNORECASE)
    en_twopart_contractions = [r"\b(a)(lot)\b", r"\b(gon)(na)\b",
                               r"\b(got)(ta)\b", r"\b(lem)(me)\b",
                               r"\b(out)(ta)\b", r"\b(wan)(na)\b",
                               r"\b(c'm)(on)\b", r"\b(more)(['’]n)\b",
                               r"\b(d['’])(ye)\b", r"(?<!\w)(['’]t)(is)\b",
                               r"(?<!\w)(['’]t)(was)\b", r"\b(there)(s)\b",
                               r"\b(i)(m)\b", r"\b(you)(re)\b",
                               r"\b(he)(s)\b", r"\b(she)(s)\b",
                               r"\b(ai)(nt)\b", r"\b(are)(nt)\b",
                               r"\b(is)(nt)\b", r"\b(do)(nt)\b",
                               r"\b(does)(nt)\b", r"\b(did)(nt)\b",
                               r"\b(i)(ve)\b", r"\b(you)(ve)\b",
                               r"\b(they)(ve)\b", r"\b(have)(nt)\b",
                               r"\b(has)(nt)\b", r"\b(can)(not)\b",
                               r"\b(ca)(nt)\b", r"\b(could)(nt)\b",
                               r"\b(wo)(nt)\b", r"\b(would)(nt)\b",
                               r"\b(you)(ll)\b", r"\b(let)(s)\b"]
    en_threepart_contractions = [r"\b(du)(n)(no)\b", r"\b(wha)(dd)(ya)\b",
                                 r"\b(wha)(t)(cha)\b", r"\b(i)('m)(a)\b"]
    # w/o, w/out, b/c, b/t, l/c, w/, d/c, u/s
    self.en_slash_words = re.compile(r"\b(?:w/o|w/out|b/t|l/c|b/c|d/c|u/s)\b|\bw/(?!\w)", re.IGNORECASE)
    # word--word
    self.en_double_hyphen = re.compile(r"(?<=\w)--+(?=\w)")
    self.en_twopart_contractions = [re.compile(contr, re.IGNORECASE) for contr in en_twopart_contractions]
    self.en_threepart_contractions = [re.compile(contr, re.IGNORECASE) for contr in en_threepart_contractions]
    # English hyphenated words
    if self.language == "en":
        nonbreaking_prefixes = utils.read_abbreviation_file("non-breaking_prefixes_%s.txt" % self.language)
        nonbreaking_suffixes = utils.read_abbreviation_file("non-breaking_suffixes_%s.txt" % self.language)
        nonbreaking_words = utils.read_abbreviation_file("non-breaking_hyphenated_words_%s.txt" % self.language)
        self.en_nonbreaking_prefixes = re.compile(r"(?<![\w-])(?:" + r'|'.join([re.escape(_) for _ in nonbreaking_prefixes]) + r")-[\w-]+", re.IGNORECASE)
        self.en_nonbreaking_suffixes = re.compile(r"\b[\w-]+-(?:" + r'|'.join([re.escape(_) for _ in nonbreaking_suffixes]) + r")(?![\w-])", re.IGNORECASE)
        self.en_nonbreaking_words = re.compile(r"\b(?:" + r'|'.join([re.escape(_) for _ in nonbreaking_words]) + r")\b", re.IGNORECASE)
    self.hyphen = re.compile(r"(?<=\w)(-)(?=\w)")
    self.en_no = re.compile(r"\b(no\.)\s*(?=\d)", re.IGNORECASE)
    self.en_degree = re.compile(r"(?<=\d ?)°(?:F|C|Oe)\b", re.IGNORECASE)

    # quotation marks
    # L'Enfer, d'accord, O'Connor
    self.letter_apostrophe_word = re.compile(r"\b([dlo]['’]\p{L}+)\b", re.IGNORECASE)
    self.paired_double_latex_quote = re.compile(r"(?<!`)(``)([^`']+)('')(?!')")
    self.paired_single_latex_quote = re.compile(r"(?<!`)(`)([^`']+)(')(?!')")
    self.paired_single_quot_mark = re.compile(r"(['‚‘’])([^']+)(['‘’])")
    self.all_quote = re.compile(r"(?<=\s)(?:``|''|`|['‚‘’])(?=\s)")
    self.other_punctuation = re.compile(r'([#<>%‰€$£₤¥°@~*„“”‚‘"»«›‹,;:+×÷±≤≥=&–—])')
    self.en_quotation_marks = re.compile(r'([„“”‚‘’"»«›‹])')
    self.en_other_punctuation = re.compile(r'([#<>%‰€$£₤¥°@~*,;:+×÷±≤≥=&/–—-]+)')
    self.ellipsis = re.compile(r'\.{2,}|…+(?:\.{2,})?')
    self.dot_without_space = re.compile(r'(?<=\p{Ll}{2})(\.)(?=\p{Lu}\p{Ll}{2})')
    # self.dot = re.compile(r'(?<=[\w)])(\.)(?![\w])')
    self.dot = re.compile(r'(\.)')
def __init__(self, split_camel_case=False, token_classes=False, extra_info=False):
    """Create a Tokenizer object. If split_camel_case is set to True,
    tokens written in CamelCase will be split. If token_classes is set
    to True, the tokenizer will output the token class for each token
    (if it is a number, an XML tag, an abbreviation, etc.). If
    extra_info is set to True, the tokenizer will output information
    about the original spelling of the tokens.

    """
    self.split_camel_case = split_camel_case
    self.token_classes = token_classes
    self.extra_info = extra_info
    self.unique_string_length = 7
    self.mapping = {}
    self.unique_prefix = None
    self.replacement_counter = 0

    self.spaces = re.compile(r"\s+")
    self.controls = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
    # soft hyphen (00AD), zero-width space (200B)
    self.other_nasties = re.compile(r"[\u00AD\u200B]")
    # combination
    self.starts_with_junk = re.compile(r"^[\u0000-\u001F\u007F-\u009F\u00AD\u200B]+")
    self.junk_between_spaces = re.compile(r"(?:^|\s+)[\s\u0000-\u001F\u007F-\u009F\u00AD\u200B]+(?:\s+|$)")

    # TAGS, EMAILS, URLs
    # self.tag = re.compile(r'<(?!-)(?:/[^> ]+|[^>]+/?)(?<!-)>')
    # taken from Regular Expressions Cookbook
    self.tag = re.compile(r"""
        <
        (?:                  # Branch for opening tags:
          ([_:A-Z][-.:\w]*)  # Capture the opening tag name to backreference 1
          (?:                # This group permits zero or more attributes
            \s+              # Whitespace to separate attributes
            [_:A-Z][-.:\w]*  # Attribute name
            \s*=\s*          # Attribute name-value delimiter
            (?: "[^"”“]*"    # Double-quoted attribute value
              | '[^']*'      # Single-quoted attribute value
            )
          )*
          \s*                # Permit trailing whitespace
          /?                 # Permit self-closed tags
        |                    # Branch for closing tags:
          /
          ([_:A-Z][-.:\w]*)  # Capture the closing tag name to backreference 2
          \s*                # Permit trailing whitespace
        )
        >
        """, re.VERBOSE | re.IGNORECASE)
    # regex for email addresses taken from:
    # http://www.regular-expressions.info/email.html
    # self.email = re.compile(r"\b[[:alnum:].%+-]+@[[:alnum:].-]+\.[[:alpha:]]{2,}\b")
    self.email = re.compile(r"\b[[:alnum:].%+-]+(?:@| \[?at\]? )[[:alnum:].-]+(?:\.| \[?dot\]? )[[:alpha:]]{2,}\b")
    # simple regex for urls that start with http or www
    # TODO: allow a closing bracket at the end if an opening one appeared after http etc.
    self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+?\(\S*?\)\S*(?=$|[\'. "!?,;\n\t])', re.IGNORECASE)
    self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)\S+[^\'. "!?,;:\n\t]', re.IGNORECASE)
    self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
    self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
    # we also allow things like tagesschau.de-App
    self.url_without_protocol = re.compile(r'\b[\w./-]+\.(?:de|com|org|net|edu|info|jpg|png|gif|log|txt)(?:-\w+)?\b', re.IGNORECASE)

    # XML entities
    self.entity_name = re.compile(r'&(?:quot|amp|apos|lt|gt);', re.IGNORECASE)
    self.entity_decimal = re.compile(r'&#\d+;')
    self.entity_hex = re.compile(r'&#x[0-9a-f]+;', re.IGNORECASE)

    # EMOTICONS
    # TODO: "Peter, SMS von gestern Nacht" -> mostly entities -> does not help all that much.
    emoticon_set = set(["(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:", "(-:", ")=",
                        ")o:", ")x", ":'C", ":/", ":<", ":C", ":[", "=(", "=)",
                        "=D", "=P", ">:", "D':", "D:", "\\:", "]:", "x(", "^^",
                        "o.O", "oO", "\\O/", "\\m/", ":;))", "_))", "*_*",
                        "._.", ":wink:", ">_<", "*<:-)", ":!:", ":;-))"])
    emoticon_list = sorted(emoticon_set, key=len, reverse=True)
    self.emoticon = re.compile(r"""(?:(?:[:;]|(?<!\d)8)  # a variety of eyes, alt.: [:;8]
                                    [-'oO]?              # optional nose or tear
                                    (?: \)+ | \(+ | [*] | ([DPp])\1*(?!\w)))  # a variety of mouths
                               """ +
                               r"|" +
                               r"(?:xD+|XD+)" +
                               r"|" +
                               r"|".join([re.escape(_) for _ in emoticon_list]),
                               re.VERBOSE)
    self.space_emoticon = re.compile(r'([:;])[ ]+([()])')
    # ^3 is an emoticon, unless it is preceded by a number (with
    # optional whitespace between number and ^3)
    # ^\^3           # beginning of line, no leading characters
    # ^\D^3          # beginning of line, one leading character
    # (?<=\D[ ])^3   # two leading characters, non-number + space
    # (?<=.[^\d ])^3 # two leading characters, x + non-space-non-number
    self.heart_emoticon = re.compile(r"(?:^|^\D|(?<=\D[ ])|(?<=.[^\d ]))\^3")
    # U+2600..U+26FF Miscellaneous Symbols
    # U+2700..U+27BF Dingbats
    # U+1F300..U+1F5FF Miscellaneous Symbols and Pictographs
    # U+1F600..U+1F64F Emoticons
    # U+1F680..U+1F6FF Transport and Map Symbols
    # U+1F900..U+1F9FF Supplemental Symbols and Pictographs
    # self.unicode_symbols = re.compile(r"[\u2600-\u27BF]|[\u1F300-\u1F64F]|[\u1F680-\u1F6FF]|[\u1F900-\u1F9FF]")
    self.unicode_symbols = re.compile(r"[\u2600-\u27BF\U0001F300-\U0001f64f\U0001F680-\U0001F6FF\U0001F900-\U0001F9FF]")

    # special tokens containing + or &
    tokens_with_plus_or_ampersand = utils.read_abbreviation_file("tokens_with_plus_or_ampersand.txt")
    plus_amp_simple = [(pa, re.search(r"^\w+[&+]\w+$", pa)) for pa in tokens_with_plus_or_ampersand]
    self.simple_plus_ampersand = set([pa[0].lower() for pa in plus_amp_simple if pa[1]])
    self.simple_plus_ampersand_candidates = re.compile(r"\b\w+[&+]\w+\b")
    tokens_with_plus_or_ampersand = [pa[0] for pa in plus_amp_simple if not pa[1]]
    # self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:\L<patokens>)(?!\w)", re.IGNORECASE, patokens=tokens_with_plus_or_ampersand)
    self.token_with_plus_ampersand = re.compile(r"(?<!\w)(?:" + r"|".join([re.escape(_) for _ in tokens_with_plus_or_ampersand]) + r")(?!\w)", re.IGNORECASE)

    # camelCase
    self.emoji = re.compile(r'\bemojiQ[[:alpha:]]{3,}\b')
    camel_case_token_list = utils.read_abbreviation_file("camel_case_tokens.txt")
    cc_alnum = [(cc, re.search(r"^\w+$", cc)) for cc in camel_case_token_list]
    self.simple_camel_case_tokens = set([cc[0] for cc in cc_alnum if cc[1]])
    self.simple_camel_case_candidates = re.compile(r"\b\w*[[:lower:]][[:upper:]]\w*\b")
    camel_case_token_list = [cc[0] for cc in cc_alnum if not cc[1]]
    # things like ImmobilienScout24.de are already covered by URL detection
    # self.camel_case_url = re.compile(r'\b(?:[[:upper:]][[:lower:][:digit:]]+){2,}\.(?:de|com|org|net|edu)\b')
    self.camel_case_token = re.compile(r"\b(?:" + r"|".join([re.escape(_) for _ in camel_case_token_list]) + r"|:Mac[[:upper:]][[:lower:]]*)\b")
    # self.camel_case_token = re.compile(r"\b(?:\L<cctokens>|Mac[[:upper:]][[:lower:]]*)\b", cctokens=camel_case_token_set)
    self.in_and_innen = re.compile(r'\b[[:alpha:]]+[[:lower:]]In(?:nen)?[[:lower:]]*\b')
    self.camel_case = re.compile(r'(?<=[[:lower:]]{2})([[:upper:]])(?![[:upper:]]|\b)')

    # ABBREVIATIONS
    self.single_letter_ellipsis = re.compile(r"(?<![\w.])(?P<a_letter>[[:alpha:]])(?P<b_ellipsis>\.{3})(?!\.)")
    self.and_cetera = re.compile(r"(?<![\w.&])&c\.(?![[:alpha:]]{1,3}\.)")
    self.str_abbreviations = re.compile(r'(?<![\w.])([[:alpha:]-]+-Str\.)(?![[:alpha:]])', re.IGNORECASE)
    self.nr_abbreviations = re.compile(r"(?<![\w.])(\w+\.-?Nr\.)(?![[:alpha:]]{1,3}\.)", re.IGNORECASE)
    self.single_letter_abbreviation = re.compile(r"(?<![\w.])[[:alpha:]]\.(?![[:alpha:]]{1,3}\.)")
    # abbreviations with multiple dots that constitute tokens
    single_token_abbreviation_list = utils.read_abbreviation_file("single_token_abbreviations.txt")
    self.single_token_abbreviation = re.compile(r"(?<![\w.])(?:" + r'|'.join([re.escape(_) for _ in single_token_abbreviation_list]) + r')(?![[:alpha:]]{1,3}\.)')
    self.ps = re.compile(r"(?<!\d[ ])\bps\.", re.IGNORECASE)
    self.multipart_abbreviation = re.compile(r'(?:[[:alpha:]]+\.){2,}')
    # only abbreviations that are not matched by (?:[[:alpha:]]\.)+
    abbreviation_list = utils.read_abbreviation_file("abbreviations.txt")
    # abbrev_simple = [(a, re.search(r"^[[:alpha:]]{2,}\.$", a)) for a in abbreviation_list]
    # self.simple_abbreviations = set([a[0].lower() for a in abbrev_simple if a[1]])
    # self.simple_abbreviation_candidates = re.compile(r"(?<![\w.])[[:alpha:]]{2,}\.(?![[:alpha:]]{1,3}\.)")
    # abbreviation_list = [a[0] for a in abbrev_simple if not a[1]]
    self.abbreviation = re.compile(r"(?<![\w.])(?:" +
                                   r"(?:(?:[[:alpha:]]\.){2,})" +
                                   r"|" +
                                   # r"(?i:" +  # this part should be case insensitive
                                   r'|'.join([re.escape(_) for _ in abbreviation_list]) +
                                   # r"))+(?![[:alpha:]]{1,3}\.)", re.V1)
                                   r")+(?![[:alpha:]]{1,3}\.)", re.IGNORECASE)

    # MENTIONS, HASHTAGS, ACTION WORDS
    self.mention = re.compile(r'[@]\w+(?!\w)')
    self.hashtag = re.compile(r'(?<!\w)[#]\w+(?!\w)')
    # action words without spaces are to be treated as units
    self.action_word = re.compile(r'(?<!\w)(?P<a_open>[*+])(?P<b_middle>[^\s*]+)(?P<c_close>[*])(?!\w)')

    # DATE, TIME, NUMBERS
    self.three_part_date_year_first = re.compile(r'(?<![\d.]) (?P<a_year>\d{4}) (?P<b_month_or_day>([/-])\d{1,2}) (?P<c_day_or_month>\3\d{1,2}) (?![\d.])', re.VERBOSE)
    self.three_part_date_dmy = re.compile(r'(?<![\d.]) (?P<a_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P<b_month>(?:0?[1-9]|1[0-2])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
    self.three_part_date_mdy = re.compile(r'(?<![\d.]) (?P<a_month>(?:0?[1-9]|1[0-2])([./-])) (?P<b_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
    self.two_part_date = re.compile(r'(?<![\d.]) (?P<a_day_or_month>\d{1,2}([./-])) (?P<b_day_or_month>\d{1,2}\2) (?![\d.])', re.VERBOSE)
    self.time = re.compile(r'(?<!\w)\d{1,2}(?::\d{2}){1,2}(?![\d:])')
    self.ordinal = re.compile(r'(?<![\w.])(?:\d{1,3}|\d{5,}|[3-9]\d{3})\.(?!\d)')
    self.fraction = re.compile(r'(?<!\w)\d+/\d+(?![\d/])')
    self.amount = re.compile(r'(?<!\w)(?:\d+[\d,.]*-)(?!\w)')
    self.semester = re.compile(r'(?<!\w)(?P<a_semester>[WS]S|SoSe|WiSe)(?P<b_jahr>\d\d(?:/\d\d)?)(?!\w)', re.IGNORECASE)
    self.measurement = re.compile(r'(?<!\w)(?P<a_amount>[−+-]?\d*[,.]?\d+)(?P<b_unit>(?:mm|cm|dm|m|km)(?:\^?[23])?|qm|g|kg|min|h|s|sek|cent|eur)(?!\w)', re.IGNORECASE)
    # also covers things like Web2.0
    self.number_compound = re.compile(r'(?<!\w) (?:\d+-?[[:alpha:]@]+ | [[:alpha:]@]+-?\d+(?:\.\d)?) (?!\w)', re.VERBOSE)
    self.number = re.compile(r"""(?<!\w)
                                 (?:[−+-]?             # optional sign
                                   \d*                 # optional digits before decimal point
                                   [.,]?               # optional decimal point
                                   \d+                 # digits
                                   (?:[eE][−+-]?\d+)?  # optional exponent
                                   |
                                   \d+[\d.,]*\d+)
                                 (?![.,]?\d)""", re.VERBOSE)

    # PUNCTUATION
    self.quest_exclam = re.compile(r"([!?]+)")
    # arrows
    self.space_right_arrow = re.compile(r'(-+)\s+(>)')
    self.space_left_arrow = re.compile(r'(<)\s+(-+)')
    self.arrow = re.compile(r'(-+>|<-+|[\u2190-\u21ff])')
    # parens
    self.paired_paren = re.compile(r'([(])(?!inn)([^()]*)([)])')
    self.paired_bracket = re.compile(r'(\[)([^][]*)(\])')
    self.paren = re.compile(r"""((?:(?<!\w)   # no alphanumeric character
                                  [[{(]       # opening paren
                                  (?=\w)) |   # alphanumeric character
                                 (?:(?<=\w)   # alphanumeric character
                                  []})]       # closing paren
                                  (?!\w)) |   # no alphanumeric character
                                 (?:(?<=\s)   # space
                                  []})]       # closing paren
                                  (?=\w)) |   # alphanumeric character
                                 (?:(?<=\w-)  # hyphen
                                  [)]         # closing paren
                                  (?=\w)))    # alphanumeric character
                             """, re.VERBOSE)
    self.all_paren = re.compile(r"(?<=\s)[][(){}](?=\s)")
    self.slash = re.compile(r'(/+)(?!in(?:nen)?|en)')
    self.paired_double_latex_quote = re.compile(r"(?<!`)(``)([^`']+)('')(?!')")
    self.paired_single_latex_quote = re.compile(r"(?<!`)(`)([^`']+)(')(?!')")
    self.paired_single_quot_mark = re.compile(r"(['‚‘’])([^']+)(['‘’])")
    self.all_quote = re.compile(r"(?<=\s)(?:``|''|`|['‚‘’])(?=\s)")
    self.other_punctuation = re.compile(r'([<>%‰€$£₤¥°@~*„“”‚‘"»«›‹,;:+=&–])')
    self.ellipsis = re.compile(r'\.{2,}|…+(?:\.{2,})?')
    self.dot_without_space = re.compile(r'(?<=[[:lower:]]{2})(\.)(?=[[:upper:]][[:lower:]]{2})')
    # self.dot = re.compile(r'(?<=[\w)])(\.)(?![\w])')
    self.dot = re.compile(r'(\.)')
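
# The older constructor above uses POSIX character classes such as
# [[:alpha:]] and [[:upper:]]; those are supported by the third-party
# `regex` package, not by the standard library `re`. A minimal sketch of
# the camel-case splitting pattern compiled above (helper name and input
# are illustrative only):
def _demo_camel_case():
    import regex
    camel_case = regex.compile(r'(?<=[[:lower:]]{2})([[:upper:]])(?![[:upper:]]|\b)')
    # Insert a space in front of each inner capital, as a camelCase
    # splitter might do:
    print(camel_case.sub(r' \1', "CamelCaseToken"))  # -> Camel Case Token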