def wordRegex():
    """
    I'm including the code to create the regex, which makes it more readable.
    Note that this uses *unicode*: among other things, that means that it needs
    to be passed a unicode-decoded string, and that we have to use the "regex"
    module instead of the "re" module. Python 3 will make this, perhaps, easier.
    (See how it says import regex as re up there? Yikes.)
    """
    fp = os.path.realpath(__file__)
    fp = os.path.dirname(fp)
    fp = os.path.dirname(fp)
    cnf = os.path.join(fp, 'bookworm.cnf')
    with open(cnf) as ff:
        for line in ff:
            if 'database' in line:
                bwname = line.split('database = ')[-1]
    if '_phonemes' in bwname:
        print('Tokenizing text using the PHONEME regex')
        bigregex = re.compile(r'\b\w*[^\s]', re.UNICODE | re.IGNORECASE)
    else:
        print('Tokenizing text using the WORD regex')
        MasterExpression = ur"\p{L}+"
        possessive = MasterExpression + ur"'s"
        numbers = r"(?:[\$])?\d+"
        decimals = numbers + r"\.\d+"
        abbreviation = r"(?:mr|ms|mrs|dr|prof|rev|rep|sen|st|sr|jr|ft|gen|adm|lt|col|etc)\."
        sharps = r"[a-gjxA-GJX]#"
        punctuators = r"[^\p{L}\p{Z}]"
        """
        Note: this compiles looking for the most complicated words first,
        and as it goes on finds simpler and simpler forms.
        """
        bigregex = re.compile("|".join([decimals, possessive, numbers, abbreviation, sharps, punctuators, MasterExpression]), re.UNICODE | re.IGNORECASE)
    return bigregex
def fetch_post_id_and_site_from_url(url):
    if url is None:
        return None
    post_type_regex = r"\/\d+#\d+$"
    post_type = ""
    search_regex = ""
    if regex.compile(post_type_regex).search(url):
        post_type = "answer"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)/questions/\d+/.+/(\d+)#\d+$"
    else:
        post_type = "question"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)/questions/(\d+)(?:/.*)?$"
    found = regex.compile(search_regex).search(url)
    if found is not None:
        try:
            post_id = found.group(2)
            post_site = found.group(1)
            return (post_id, post_site, post_type)
        except:
            return None
    search_regex = r"^(?:https?:)?\/\/([\w.]+)/(q|a)/(\d+)(?:/\d+)?/?"
    found = regex.compile(search_regex).search(url)
    if found is None:
        return None
    try:
        post_id = found.group(3)
        post_site = found.group(1)
        post_type = "question" if found.group(2) == "q" else "answer"
        return (post_id, post_site, post_type)
    except:
        return None
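# Usage sketch (added for illustration; the URLs below are made up and the
# function above is assumed to be in scope together with `import regex`):
print(fetch_post_id_and_site_from_url("https://stackoverflow.com/questions/123/some-title"))
# ('123', 'stackoverflow.com', 'question')
print(fetch_post_id_and_site_from_url("https://stackoverflow.com/questions/123/some-title/456#456"))
# ('456', 'stackoverflow.com', 'answer')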
def main(in_file, out_file, arg_i, arg_v):
    global module_dict, import_dict, qname_dict, fixup_dict
    indent_re = regex.compile('^\s*(#*)')
    qnames_re = regex.compile('Q\w+')
    # If we will do suggested imports, load all the classnames
    # in the module_dict. This may take some little while!
    if arg_i:
        load_module_dict()
    line = in_file.readline()
    while line:
        out_lines = []
        indent_m = indent_re.match(line)
        skip = 0 < len(indent_m.group(1))  # is a comment or
        skip |= line.startswith('import')  # ..an import or
        skip |= (line.startswith('from') and (0 > line.find('__future')))
        if not skip:
            # set up appropriate indent for comments
            indent = indent_m.group(0) + '#! '
            # Note all the QXxxx names in this line
            qnames_in_line = qnames_re.findall(line)
            if arg_i:
                # Add each to the set of its module
                for qname in qnames_in_line:
                    if qname in module_dict:  # only valid ones
                        import_dict[module_dict[qname]].add(qname)
            # Check the QXxxx names for the troublesome ones.
            for qname in qnames_in_line:
                if qname in qname_dict:
                    out_lines += qname_dict[qname](line)
            # Run through the non-QXxxx indicator strings.
            for (indicator, fixup) in fixup_dict.items():
                if indicator in line:
                    out_lines += fixup(line)
            # Write any annotation comments.
            for comment in out_lines:
                out_file.write(indent + comment + '\n')
        # endif skip
        out_file.write(line)
        line = in_file.readline()
    # end while line
    if arg_i:
        out_file.write('\n\n#! Suggested import statements for class-names seen above.\n')
        out_file.write('#! You must move these to the top of the file replacing any\n')
        out_file.write('#! existing PyQt4 import statements.\n')
        for (mod_name, class_set) in import_dict.items():
            if len(class_set):  # not an empty set
                out_file.write('from PyQt5.{0} import\n ('.format(mod_name))
                join_string = ',\n ' if arg_v else ', '
                out_file.write(join_string.join(sorted(class_set)))
                out_file.write(')\n')
def __init__(self, start=None, end=None, void=None, structs=None):
    self.start = start if start else re.compile(r"<(\w+).*?(?<!/)>")
    self.end = end if end else re.compile(r"</(\w+)>")
    self.void = void if void else re.compile(r"<(\w+).*?/>")
    self.stags = set()
    self.etags = set()
    self.vtags = set()
def readConfigFile(
    source      # pathname to config file to read
    ):
    # Purpose: read the configuration file at 'source', parse it,
    #          store values in a dictionary
    # Returns: the dictionary parsed from 'source'
    # Assumes: 'source' exists
    # Effects: reads from the file system
    # Throws: IOError if there are problems reading
    fp = open(source, 'r')
    lines = fp.readlines()
    fp.close()
    ignore_line = regex.compile('[ \t]*#')  # comment line
    data_line = regex.compile('[ \t]*'
                              '\([^ \t]+\)'
                              '[ \t]*\(.*\)')
    dict = {}
    for line in lines:
        if ignore_line.match(line) == -1:
            if data_line.match(line) != -1:
                (parameter, value) = data_line.group(1, 2)
                dict[string.upper(parameter)] = value
    return dict
def build_check_whitespace_timestamp(t):
    CR_RE = re.compile(r'\r')
    TRAILING_WHITESPACE_RE = re.compile(r'\s+\n\Z')
    NO_NEWLINE_RE = re.compile(r'[^\n]\Z')
    ALL_WHITESPACE_RE = re.compile(r'\s+\Z')
    errors = 0
    for filename in sorted(t.newer(t.dependencies)):
        whitespace = False
        for lineno, line in enumerate(open(filename)):
            if CR_RE.search(line):
                t.info('%s:%d: carriage return character in line', filename, lineno + 1)
                errors += 1
            if TRAILING_WHITESPACE_RE.search(line):
                t.info('%s:%d: trailing whitespace', filename, lineno + 1)
                errors += 1
            if NO_NEWLINE_RE.search(line):
                t.info('%s:%d: no newline at end of file', filename, lineno + 1)
                errors += 1
            whitespace = ALL_WHITESPACE_RE.match(line)
        if whitespace:
            t.info('%s: trailing whitespace at end of file', filename)
            errors += 1
    if errors:
        t.error('%d whitespace errors' % (errors,))
    t.touch()
def clean_line(line):
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"}  # note put \. in the file / how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)
    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
def _prep_cleanup_re():
    symbols = []
    digits = []
    white = []
    for c in range(0x10FFFF):
        x = chr(c)
        cat = unicodedata.category(x)
        # All punctuation, symbols, and whitespace, C0 and C1 controls,
        # and "format effectors" (e.g. ZWNJ, RLE). Cn (unassigned),
        # Cs (surrogate), and Co (private use) are not stripped.
        if cat[0] in ('P', 'S'):
            # Don't strip leading and trailing hyphens and apostrophes.
            # FIXME: this really ought to be an exhaustive list of P-
            # and S-class characters that can be *part of* a word.
            if x in ('-', '‐', '\'', '’'):
                continue
            # These characters need to be escaped inside a character class.
            # '-' is not included because the preceding 'if' removed it.
            if x in ('\\', '[', ']'):
                symbols.append('\\' + x)
            else:
                symbols.append(x)
        elif cat[0] == 'N':
            digits.append(x)
        elif cat[0] == 'Z' or cat in ('Cc', 'Cf'):
            white.append(x)
    symbols = "".join(symbols)
    digits = "".join(digits)
    white = "".join(white)
    return (
        re.compile("^[" + symbols + white + "]+"),
        re.compile("[" + symbols + white + "]+$"),
        re.compile("^[" + symbols + white + digits + "'’\\-‐" + "]+$"),
        re.compile("[" + symbols + digits + "]+")
    )
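# Usage sketch (added for illustration): the four compiled patterns strip
# leading junk, strip trailing junk, recognise tokens with nothing word-like
# in them, and match any symbol/digit run, respectively. Building them walks
# every code point, so the call takes a moment.
lead_re, trail_re, only_junk_re, sym_digit_re = _prep_cleanup_re()
print(lead_re.sub("", "«—hello"))        # 'hello'  (leading punctuation removed)
print(trail_re.sub("", "hello!!! "))     # 'hello'
print(bool(only_junk_re.match("1234—"))) # True: nothing word-like survives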
def updateline(file, key, value, casefold=1):
    try:
        f = open(file, 'r')
        lines = f.readlines()
        f.close()
    except IOError:
        lines = []
    pat = key + ':\(.*\)\n'
    if casefold:
        prog = regex.compile(pat, regex.casefold)
    else:
        prog = regex.compile(pat)
    if value is None:
        newline = None
    else:
        newline = '%s: %s' % (key, value)
    for i in range(len(lines)):
        line = lines[i]
        if prog.match(line) == len(line):
            if newline is None:
                del lines[i]
            else:
                lines[i] = newline
            break
    else:
        if newline is not None:
            lines.append(newline)
    f = open(tempfile, 'w')
    for line in lines:
        f.write(line)
    f.close()
def __load_txt(self):
    rn1 = r"(?P<authors>((\pL\. ?(\pL\. )?\pL+,? )|(\pL+ \pL\. ?(\pL\.)?,? )"  # regular expression for authors
    rn2 = r"|(\p{Lu}\p{Ll}+ \p{Lu}\p{Ll}+,? )"
    rn3 = r")+)"
    ra_ru = r"(?P<article>\p{Lu}\p{Ll}+ \p{Ll}+.*?) *\/\/ *"  # regular expression for article
    ra_eng = r"(?P<article>\p{Lu}.*?) *\/\/ *"  # regular expression for article
    rj = r'(?P<source>[ \pL"“”]+)'  # regular expression for source
    rm = r"(?P<misc>.+)"  # regular expression for misc
    reg_ru = re.compile(rn1 + rn2 + rn3 + ra_ru + rj + rm, re.UNICODE)
    reg_eng = re.compile(rn1 + rn3 + ra_eng + rj + rm, re.UNICODE)
    data = []
    f = open(self.filename, 'r')
    content = f.read()
    items = content.split('\n')
    for item in items:
        res = None
        if isEnglish(item[:15]):
            res = reg_eng.match(item.strip())
        else:
            res = reg_ru.match(item.strip())
        if res != None:
            publication = Publication()
            publication.authors = Author.parseAuthors(res.group("authors"))
            data.append({"authors": split_authors(res.group("authors")),
                         "article": res.group("article"),
                         "source": res.group("source"),
                         "misc": res.group("misc")})
        else:
            print("Wrong line: " + item)
    return data
def sample1(filename, aft=None, fore=None, top=None, home=None):
    doc = SeriesDocument('HTMLgen.rc')
    doc.goprev, doc.gonext, doc.gotop, doc.gohome = aft, fore, top, home
    doc.background = '../image/texturec.jpg'
    doc.banner = ('../image/historic.gif', 472, 60)
    doc.author = '1776 Thomas Jefferson'
    doc.email = '*****@*****.**'
    doc.logo = ('../image/eagle21.gif', 64, 54)
    # parse Declaration of Independence
    re_hline = regex.compile('^--+$')
    re_title = regex.compile('^Title:\(.*$\)')
    font2 = Font(size='+2')
    s = open(os.path.join(datadir, 'DoI.txt')).read()
    paragraphs = regsub.split(s, '\n\([\t ]*\n\)+')
    for para in paragraphs:
        if not para:
            continue
        if re_title.search(para) > -1:
            doc.title = re_title.group(1)
        elif re_hline.search(para) > -1:
            doc.append(HR())
        else:
            p = Paragraph(para)
            # using \` to match beginning of paragraph
            # ^ won't work because it'll match all the newlines
            n = p.markup('\`\(\w\)', font2, reg_type='regex')
            doc.append(p)
    doc.write(os.path.join(htmldir, filename))
def compileRegex(string, flags):
    try:
        return regex.compile(string, convertRegex(flags))
    except:
        for od in HEXADECIMAL_PATTERNS:
            string = string.replace(od[0], od[1])
        return regex.compile(string, convertRegex(flags))
def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):
    """Remove and substitute post-processing for Greek TLG text.
    TODO: Surely more junk to pull out. Please submit bugs!
    TODO: {.+?}|\(.+?\) working?
    TODO: This is rather slow now; help in speeding it up is welcome.
    """
    remove_comp = regex.compile(r'-\n|«|»|<|>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|[a-zA-Z0-9]',
                                flags=regex.VERSION1)
    text = remove_comp.sub('', text)
    new_text = None
    if rm_punctuation:
        new_text = ''
        punctuation = [',', '·', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']
        if rm_periods:
            punctuation += ['.', ';']
        for char in text:
            # second try at removing some punctuation; merge with above regex
            if char in punctuation:
                pass
            else:
                new_text += char
    if new_text:
        text = new_text
    # replace line breaks with a space
    replace_comp = regex.compile(r'\n')
    text = replace_comp.sub(' ', text)
    comp_space = regex.compile(r'\s+')
    text = comp_space.sub(' ', text)
    return text
def expand_parens(str, include_spaces=False):
    output = []
    if "‣" in str:
        for i in str.split("‣"):
            output.extend(expand_parens(i))
        return output
    if include_spaces:
        regex1 = re.compile(r"(^.*)\((.+)\)(.*$)")
        regex2 = re.compile(r"(^.*)\((.+)\)(.*$)")
    else:
        regex1 = re.compile(r"(^.*[^ ])\(([^ ]+)\)(.*$)")
        regex2 = re.compile(r"(^.*)\(([^ ]+)\)([^ ].*$)")
    re_match1 = regex1.search(str)
    re_match2 = regex2.search(str)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [str]
    output = [clean_str(without), clean_str(within)]
    return output
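# Usage sketch (added for illustration; clean_str is not shown in this excerpt,
# so a trivial whitespace-stripping stand-in is assumed here and may differ
# from the real helper):
def clean_str(s):
    return s.strip()

print(expand_parens("word(s)"))  # ['word', 'words']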
def test_post(title, body, user_name, site, is_answer, body_is_summary):
    result = []
    for rule in FindSpam.rules:
        body_to_check = body
        if rule['stripcodeblocks']:
            body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
            body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
        if rule['all'] != (site in rule['sites']):
            matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
            matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
            matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
            if matched_title and rule['title']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "title"))
            if matched_username and rule['username']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "username"))
            if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                type_of_post = "answer" if is_answer else "body"
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                        result.append(rule['reason'].replace("{}", type_of_post))
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", type_of_post))
    return result
def rebuildregexes(self):
    """
    rebuild a regex for priority
    will need a colored and a noncolored regex for each priority
    """
    colorres = []
    noncolorres = []
    for trig in self.uniquelookup.values():
        if trig['enabled']:
            if 'matchcolor' in trig \
                    and trig['matchcolor']:
                colorres.append("(?P<%s>%s)" % (trig['unique'], trig['nonamedgroups']))
            else:
                noncolorres.append("(?P<%s>%s)" % (trig['unique'], trig['nonamedgroups']))
    if colorres:
        try:
            self.regex['color'] = re.compile("|".join(colorres))
        except re.error:
            self.api('send.traceback')('Could not compile color regex')
    else:
        self.regex['color'] = ""
    try:
        self.regex['noncolor'] = re.compile("|".join(noncolorres))
    except re.error:
        self.api('send.traceback')('Could not compile regex')
def clean_line(line):
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    pos = re.search(reg_ayyen_tur, line)
    if pos:
        line = line[:pos.start()]
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
def get_user_from_list_command(cmd):  # for example, !!/addblu is a list command
    cmd_merged_spaces = regex.sub("\\s+", " ", cmd)
    cmd_parts = cmd_merged_spaces.split(" ")
    uid = -1
    site = ""
    if len(cmd_parts) == 1:
        uid_site = get_user_from_url(cmd_parts[0])
        if uid_site is not None:
            uid, site = uid_site
    elif len(cmd_parts) == 2:
        uid = cmd_parts[0]
        site = cmd_parts[1]
        digit_re = regex.compile("^[0-9]+$")
        site_re = regex.compile(r"^(\w+\.stackexchange\.com|\w+\.(com|net))$")
        if not digit_re.match(uid):
            uid = -1
            site = ""
        elif not site_re.match(site):
            exists, name = datahandling.check_site_and_get_full_name(site)
            if exists:
                return uid, name
            else:
                return -2, name
    return uid, site
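# Usage sketch (added; the command string is hypothetical and the helpers used
# by the other branches, get_user_from_url and datahandling, are assumed to be
# importable). A "<uid> <site>" command whose site already looks canonical
# passes straight through:
#   get_user_from_list_command("1234 stackoverflow.com")
#     -> ('1234', 'stackoverflow.com')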
def fetch_post_id_and_site_from_url(url):
    if url is None:
        return None
    trimmed_url = rebuild_str(url)
    post_type_regex = r"(?:\/\d+)?#\d+$"
    post_type = ""
    search_regex = ""
    if regex.compile(post_type_regex).search(trimmed_url):
        post_type = "answer"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)\/questions\/\d+\/.+[/#](\d+)(?:#\d+)?$"
    else:
        post_type = "question"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)/questions/(\d+)(?:/.*)?$"
    found = regex.compile(search_regex).search(trimmed_url)
    if found is not None:
        try:
            post_id = found.group(2)
            post_site = found.group(1)
            return (post_id, post_site, post_type)
        except IndexError:
            return None
    search_regex = r"^(?:https?:)?\/\/([\w.]+)/(q|a)/(\d+)(?:/\d+)?/?"
    found = regex.compile(search_regex).search(trimmed_url)
    if found is None:
        return None
    try:
        post_id = found.group(3)
        post_site = found.group(1)
        post_type = "question" if found.group(2) == "q" else "answer"
        return (post_id, post_site, post_type)
    except IndexError:
        return None
def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
    """
    Builds a codec converting between graphemes/code points and integer
    label sequences.

    charset may either be a string, a list or a dict. In the first case each
    code point will be assigned a label, in the second case each string in
    the list will be assigned a label, and in the final case each key string
    will be mapped to the value sequence of integers. In the first two cases
    labels will be assigned automatically.

    As 0 is the blank label in a CTC output layer, output labels and input
    dictionaries are/should be 1-indexed.

    Args:
        charset (unicode, list, dict): Input character set.
    """
    if isinstance(charset, dict):
        self.c2l = charset
    else:
        self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
    # map integer labels to code points because regex only works with strings
    self.l2c = {}  # type: Dict[str, str]
    for k, v in self.c2l.items():
        self.l2c[''.join(chr(c) for c in v)] = k
    # sort prefixes for c2l regex
    self.c2l_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
    # sort prefixes for l2c regex
    self.l2c_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))
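# Illustration (added comment, not from the original source): for the simple
# string charset "ab" the constructor above produces
#   c2l == {'a': [1], 'b': [2]}   and   l2c == {chr(1): 'a', chr(2): 'b'},
# and both regexes sort their alternatives longest-first so multi-character
# graphemes win over their own prefixes.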
def __init__(self, Normalizer, normalization=False):
    super(RegExp, self).__init__()
    self.normalization = normalization
    self.Normalizer = Normalizer
    self.matrices = {
        "primarySource": {
            "matcher": self.generate("primarySource", False),
            "grouper": self.generate("primarySource")
        },
        "secondarySource": {
            "matcher": self.generate("secondarySource", False),
            "grouper": self.generate("secondarySource")
        },
        "quotes": {
            "matcher": self.generate("quote", False),
            "grouper": self.generate("quote")
        },
        "senses": {
            "grouper": re.compile("^([1-9]{1,3}|[abcdefABCDEFαβγδ]{1}|IX|IV|V?I{0,3})$"),
            "splitter": re.compile("[–\,]{0,1}\s([1-9]{1,3}|[abcdefABCDEFαβγδ]{1}|IX|IV|V?I{0,3})\)\s")
        },
        "greek": {
            "matcher": self.generate("greek"),
            "grouper": re.compile("(?P<match>(?:(?:[\p{Greek}µ']+)+[\s\.\,]*)+)")
        },
        "firstLine": {
            "grouper": self.generate("firstLine")
        }
    }
def add_spaces(text, exclude=None):
    if exclude:
        patt_exclude = regex.escape(exclude)
        patt_eng_cjk = regex.compile(u"([[%s]--%s])([%s])" % (CHAR_ENG_LEFT, patt_exclude, CHAR_CJK))
        patt_cjk_eng = regex.compile(u"([%s])([[%s]--%s])" % (CHAR_CJK, CHAR_ENG_RIGHT, patt_exclude))
    else:
        patt_eng_cjk = PATTERN_ENG_CJK
        patt_cjk_eng = PATTERN_CJK_ENG

    def add_space_func(index1, index2):
        def add_space(match):
            return u"%s %s" % (match.group(index1), match.group(index2))
        return add_space

    text = patt_cjk_eng.subn(add_space_func(1, 2), text)[0]
    text = patt_eng_cjk.subn(add_space_func(1, 2), text)[0]
    if not (exclude and '"' in exclude):
        # XXX"YYY"XXX -> XXX "YYY" XXX
        # where X and Y are CJK characters
        is_left_dquote = True
        is_left_squote = True
        out = StringIO.StringIO()
        for i in xrange(len(text)):
            prev_char = text[i - 1] if i > 0 else None
            cur_char = text[i]
            next_char = text[i + 1] if i < len(text) - 1 else None
            if cur_char == u'"':
                if is_left_dquote:
                    if _is_cjk(prev_char):
                        out.write(u' "')
                    else:
                        out.write(u'"')
                    is_left_dquote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u'" ')
                    else:
                        out.write(u'"')
                    is_left_dquote = True
            elif cur_char == u"'":
                if is_left_squote:
                    if _is_cjk(prev_char):
                        out.write(u" '")
                    else:
                        out.write(u"'")
                    is_left_squote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u"' ")
                    else:
                        out.write(u"'")
                    is_left_squote = True
            else:
                out.write(cur_char)
        text = out.getvalue()
        out.close()
    return text
def setliteral(self, tag):
    self.literal = 1
    re = "%s%s[%s]*%s" % (ETAGO, tag, string.whitespace, TAGC)
    if self._normfunc is string.lower:
        self._lit_etag_re = regex.compile(re, regex.casefold)
    else:
        self._lit_etag_re = regex.compile(re)
def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""  # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
def _replace_for(self, text, nested_position, keyword_number=1):
    """
    Finds and replaces the % for: ... % endfor loops of the mail.template.
    It will create keyword records for each loop found.
    :param text: mail.template text
    :param nested_position: counts how nested the current pass is
    :param keyword_number: counts how many for loops we found
    :return: simplified text without the if code, keywords found
    """
    # Regex for finding text wrapped in loops
    loop_regex = r'(% for .*?:$)(.*?)(% endfor)'
    ul_loop_regex = r'(?:<ul[^<]*?)(% for .*?:$)(.*?)(% endfor)(.*?</ul>)'
    # First scan for ul_loops
    for_pattern = re.compile(ul_loop_regex, flags=re.DOTALL | re.MULTILINE)
    simple_text, found_keywords = self._replace_for_type(
        text, nested_position, keyword_number, 'for_ul', for_pattern)
    keyword_number += len(found_keywords)
    # Then scan for regular loops
    for_pattern = re.compile(loop_regex, flags=re.DOTALL | re.MULTILINE)
    simple_text, keywords = self._replace_for_type(
        simple_text, nested_position, keyword_number, 'for', for_pattern)
    found_keywords |= keywords
    return simple_text, found_keywords
def _reload_allowed_list_file(self):
    '''(Re)loads the list with rules for non-segment borders, i.e. rules that
    stop a possible segment border from being split (unless forced by a
    forcing rule specified for the stop rule). The stop rules are pairs of two
    rules, of which the first is matched against the segment to the left and
    the latter is matched against the segment to the right.

    The filename is given in the __init__, and the default file is
    "./data/stop_list". See the __init__() and segment() functions for more
    about the algorithm.

    ATTENTION: note that verbose regexps are used.'''
    with open(self._allowed_list_filename, 'r') as f:
        _filedata = f.readlines()
    self._allowed_regexps = list()
    _rule_left = ''
    _rule_right = ''
    for i in range(len(_filedata)):
        # rules must be specified in correct order: first left, then right
        if _filedata[i].startswith('LEFT:'):
            _rule_left = regex.compile(_filedata[i][5:], regex.VERBOSE)
        elif _filedata[i].startswith('RIGHT:'):
            _rule_right = regex.compile(_filedata[i][6:], regex.VERBOSE)
            self._allowed_regexps.append((_rule_left, _rule_right))
            _rule_left = ''
            _rule_right = ''
        else:
            # everything else is ignored
            continue
def __init__(self, directory_name):
    self.directory = directory_name
    self.unigram_frequency = Counter()
    self.trigrams = dict()
    self.trigram_load_pattern = re2.compile(r'^([^ ]*) ([^ ]*) ([^\t]*)\t(\d*)')
    self.middle_token_pattern = re2.compile(r'^\p{posix_alnum}*$', re2.UNICODE)
    super(FileScorer, self).__init__()
def __init__(self, src, javaFlag=0):
    Doxy2SWIG.__init__(self, src, javaFlag)
    """
    Turns on the title, brief description and detailed description markup.
    Turn them off when inside member documentation.
    """
    self.FilterTitle = True
    self.sitkClassName = ''
    self.EmptyText = False
    # compiled regular expressions
    # common formula types in xml version of documentation
    self.dollarFormula = re.compile("^\\$(.+)\\$$")
    self.arrayFormula = re.compile("^\\\\\\[(.+)\\\\\\]$")
    # more complex formula layout, that breaks R documentation checks
    self.mathstuff1 = re.compile(r"\\begin\{array\}\{[^}]+\}")
    self.mathstuff2 = re.compile(r"\\begin\{array\}")
    self.mathstuff3 = re.compile(r"\\end\{array\}")
    # a complex recursive regular expression, to deal with formulas
    # inside mbox and text structures
    self.mathstuff4 = regex.compile(r"\\mbox({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
    self.mathstuff5 = regex.compile(r"\\text({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
    # the special doxygen tags - note - not greedy
    self.mathstuff6 = re.compile(r"\\f\$(.+?)\\f\$")
    # alignment tags
    self.mathstuff7 = re.compile(r" & ")
def __init__(self):
    # These attributes are set by the parse method
    self.doc = None
    self.para = None
    self.current_string = None
    self.flow = None

    self.stateMachine = StateMachine()
    self.stateMachine.add_state("PARA", self._para)
    self.stateMachine.add_state("ESCAPE", self._escape)
    self.stateMachine.add_state("END", None, end_state=1)
    self.stateMachine.add_state("ANNOTATION-START", self._annotation_start)
    self.stateMachine.add_state("CITATION-START", self._citation_start)
    self.stateMachine.add_state("BOLD-START", self._bold_start)
    self.stateMachine.add_state("ITALIC-START", self._italic_start)
    self.stateMachine.add_state("CODE-START", self._code_start)
    self.stateMachine.add_state("QUOTES-START", self._quotes_start)
    self.stateMachine.add_state("INLINE-INSERT", self._inline_insert)
    self.stateMachine.add_state("CHARACTER-ENTITY", self._character_entity)
    self.stateMachine.set_start("PARA")
    self.patterns = {
        'escape': re.compile(r'\\', re.U),
        'escaped-chars': re.compile(r'[\\\(\{\}\[\]_\*,\.\*`"&]', re.U),
        'annotation': re.compile(
            r'(?<!\\)\{(?P<text>.*?)(?<!\\)\}(\(\s*(?P<type>\S*?\s*[^\\"\']?)(["\'](?P<specifically>.*?)["\'])??\s*(\((?P<namespace>\w+)\))?\s*(~(?P<language>[\w-]+))?\))?',
            re.U),
        'bold': re.compile(r'\*(?P<text>((?<=\\)\*|[^\*])*)(?<!\\)\*', re.U),
        'italic': re.compile(r'_(?P<text>((?<=\\)_|[^_])*)(?<!\\)_', re.U),
        'code': re.compile(r'`(?P<text>(``|[^`])*)`', re.U),
        'quotes': re.compile(r'"(?P<text>((?<=\\)"|[^"])*)(?<!\\)"', re.U),
        'inline-insert': re.compile(r'>\((?P<attributes>.*?)\)', re.U),
        'character-entity': re.compile(r'&(\#[0-9]+|#[xX][0-9a-fA-F]+|[\w]+);'),
        'citation': re.compile(
            r'(\[\s*\*(?P<id>\S+)(\s+(?P<id_extra>.+?))?\])|(\[\s*\#(?P<name_name>\S+)(\s+(?P<extra>.+?))?\])|(\[\s*(?P<citation>.*?)\])',
            re.U)
    }
def makeconfig(infp, outfp, modules, with_ifdef=0):
    m1 = regex.compile('-- ADDMODULE MARKER 1 --')
    m2 = regex.compile('-- ADDMODULE MARKER 2 --')
    while 1:
        line = infp.readline()
        if not line:
            break
        outfp.write(line)
        if m1 and m1.search(line) >= 0:
            m1 = None
            for mod in modules:
                if mod in never:
                    continue
                if with_ifdef:
                    outfp.write("#ifndef init%s\n" % mod)
                outfp.write('extern void init%s();\n' % mod)
                if with_ifdef:
                    outfp.write("#endif\n")
        elif m2 and m2.search(line) >= 0:
            m2 = None
            for mod in modules:
                if mod in never:
                    continue
                outfp.write('\t{"%s", init%s},\n' % (mod, mod))
    if m1:
        sys.stderr.write('MARKER 1 never found\n')
    elif m2:
        sys.stderr.write('MARKER 2 never found\n')
def __init__(self): super().__init__() self.re_match = re.compile( r"""([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)""" )
class CommonDefinitionPatterns:
    reg_semicolon = re.compile("([\"'“„])(?:(?=(\\\\?))\\2.)*?\\1(?=:)",
                               re.UNICODE | re.IGNORECASE)
    reg_quoted = re.compile("([\"'“„])(?:(?=(\\\\?))\\2.)*?\\1",
                            re.UNICODE | re.IGNORECASE)
    reg_acronyms = re.compile(r"\(\p{Lu}\p{L}*\p{Lu}\)", re.UNICODE)

    @staticmethod
    def match_acronyms(phrase: str) -> List[PatternFound]:
        """
        :param phrase: rompió el silencio tras ser despedido del Canal del Fútbol (CDF).
        :return: {name: 'CDF', probability: 100, ...}
        """
        defs = []
        for match in CommonDefinitionPatterns.reg_acronyms.finditer(phrase):
            acr_start = CommonDefinitionPatterns.get_acronym_words_start(phrase, match)
            if acr_start < 0:
                continue
            df = PatternFound()
            df.name = match.group().strip('() ')
            df.start = acr_start
            df.end = match.start() - 1
            df.probability = 100
            defs.append(df)
        return defs

    @staticmethod
    def get_acronym_words_start(phrase: str, match: Match) -> int:
        """
        each acronym match should be preceded by capitalized words that start
        from the same letters
        :param phrase: "rompió el silencio tras ser despedido del Canal del Fútbol (CDF). "
        :param match: "(CDF)" Match object for this example
        :return: start letter (42 for this case) index or -1
        """
        proc = UniversalDefinitionsParser.basic_line_processor
        name = match.group().strip('() ').upper()
        start = match.start()
        words = proc.split_text_on_words(phrase[:start])
        if len(words) < 2:
            return -1
        mistakes = 0
        uppercases = 0
        acr_index = len(name) - 1
        acr_start = words[-1].start
        for i in range(len(words) - 1, -1, -1):
            if words[i].is_separator:
                continue
            l = words[i].text[0]
            l_upper = l.upper()
            is_upper = l_upper == l
            if is_upper:
                uppercases += 1
            is_correct = name[acr_index] == l_upper
            if not is_correct:
                mistakes += 1
                if mistakes > 1:
                    return -1
                continue
            acr_start = words[i].start
            acr_index -= 1
            if acr_index < 0:
                break
        return acr_start if uppercases > 1 and acr_index < 0 else -1

    @staticmethod
    def match_es_def_by_semicolon(phrase: str) -> List[PatternFound]:
        """
        :param phrase: "Modern anatomy human": a human of modern anatomy.
        :return: {name: 'Modern anatomy human', probability: 100, ...}
        """
        prob = 100
        defs = []
        for match in CommonDefinitionPatterns.reg_semicolon.finditer(phrase):
            df = PatternFound()
            df.name = match.group()
            df.start = 0
            df.end = len(phrase)
            df.probability = prob
            defs.append(df)
            prob = 66
        return defs

    @staticmethod
    def peek_quoted_part(phrase: str,
                         match: Match,
                         start_func: Callable[[str, Match, Match], int],
                         end_func: Callable[[str, Match, Match], int],
                         match_prob: int) -> List[PatternFound]:
        """
        :param phrase: the whole text, may be used for getting the definition's text length
        :param match: the matched part of the phrase that may contain several quote-packed definitions
        :param start_func: (phrase, match, quoted_match) -> definition's start
        :param end_func: (phrase, match, quoted_match) -> definition's end
        :param match_prob: definition's probability
        :return: a list of definitions found or an empty list
        """
        defs = []
        text = match.group()
        quoted_entries = [m for m in CommonDefinitionPatterns.reg_quoted.finditer(text)]
        if len(quoted_entries) == 0:
            return defs
        for entry in quoted_entries:
            df = PatternFound()
            df.name = entry.group()
            df.start = start_func(phrase, match, entry)
            df.end = end_func(phrase, match, entry)
            df.probability = match_prob
            defs.append(df)
        return defs

    @staticmethod
    def collect_regex_matches_with_quoted_chunks(phrase: str, reg: re, prob: int,
                                                 quoted_def_start: Callable[[str, Match, Match], int],
                                                 quoted_def_end: Callable[[str, Match, Match], int],
                                                 def_start: Callable[[str, Match], int],
                                                 def_end: Callable[[str, Match], int]) -> List[PatternFound]:
        """
        First, find all matches of the 'reg' pattern. Second, go through the
        matches; for each match try to find a set of quoted words. If found,
        use them as matches, otherwise use the whole match.
        :param quoted_def_start: (phrase, match, quoted_match) -> definition's start
        :param quoted_def_end: (phrase, match, quoted_match) -> definition's end
        :param def_start: (phrase, match) -> definition's start
        :param def_end: (phrase, match) -> definition's end
        :return:
        """
        defs = []
        for match in reg.finditer(phrase):
            quoted_matches = \
                CommonDefinitionPatterns.peek_quoted_part(phrase,
                                                          match,
                                                          quoted_def_start,
                                                          quoted_def_end,
                                                          prob)
            if len(quoted_matches) > 0:
                defs += quoted_matches
                continue
            df = PatternFound()
            df.name = match.group()
            df.start = def_start(phrase, match)
            df.end = def_end(phrase, match)
            df.probability = prob
            defs.append(df)
        return defs

    @staticmethod
    def collect_regex_matches(phrase: str, reg: re, prob: int,
                              def_start: Callable[[str, Match], int],
                              def_end: Callable[[str, Match], int]) -> List[PatternFound]:
        """
        Find all matches of the 'reg' pattern.
        :param def_start: (phrase, match) -> definition's start
        :param def_end: (phrase, match) -> definition's end
        :return:
        """
        defs = []
        for match in reg.finditer(phrase):
            df = PatternFound()
            df.name = match.group()
            df.start = def_start(phrase, match)
            df.end = def_end(phrase, match)
            df.probability = prob
            defs.append(df)
        return defs
# -*- coding: utf-8 -*-
"""
Created on Mon May 20 16:41:00 2019

@author: richard.mitanchey
"""
from lxml import etree
import regex as re
import unicodedata
import io
import networkx as nx
from collections import OrderedDict
import json

many = re.compile(r'\.\.\*')

parser = etree.XMLParser()
nsmap = {
    "uml": "http://www.omg.org/spec/UML/20110701",
    "xmi": "http://www.omg.org/spec/XMI/20110701",
    "thecustomprofile": "http://www.sparxsystems.com/profiles/thecustomprofile/1.0",
    "UML_Profile_for_INSPIRE_data_specifications": "http://www.sparxsystems.com/profiles/UML_Profile_for_INSPIRE_data_specifications/3.0-2"
}
# -*- coding:utf-8 -*-
import random, os
import regex as re
from unidecode import unidecode

_punct_re = re.compile(r'[\t !":\!#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')


def create_chain(file_paths):
    markov_chain = {}
    word1 = "\n"
    word2 = "\n"
    for path in file_paths:
        with open(path) as file:
            for line in file:
                line = line.strip()
                for current_word in line.split():
                    if current_word != "":
                        markov_chain.setdefault((word1, word2), []).append(current_word)
                        word1 = word2
                        word2 = current_word
    return markov_chain


def construct_sentence(markov_chain, word_count=5, slug=False):
    generated_sentence = ""
    word_tuple = random.choice(list(markov_chain.keys()))
    w1 = word_tuple[0]
    w2 = word_tuple[1]
    for i in range(word_count):
        newword = random.choice(markov_chain[(w1, w2)])
def compile_infix_regex(entries):
    expression = '|'.join([piece for piece in entries if piece.strip()])
    return re.compile(expression)
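# Usage sketch (added; the entries below are hypothetical): whitespace-only and
# empty pieces are dropped before joining, so they never become empty alternatives.
infixes = [r"\.\.+", "--", "   ", ""]
pattern = compile_infix_regex(infixes)
print(pattern.findall("state--of--the..art"))  # ['--', '--', '..']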
def __init__(self): super().__init__() self.re_match = re.compile(r"""\b\d+\b""")
def __init__(self): super().__init__() self.re_match = re.compile(r"""[^!"&':;?,\.\w\d ]+""")
def __init__(self): super().__init__() self.re_match = re.compile(r"\S*\d+\S*", re.IGNORECASE)
def __init__(self): super().__init__() self.re_match = re.compile(r"[!¡\"&':;¿?,\.]+")
def __init__(self): super().__init__() self.re_match = re.compile( r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""", re.IGNORECASE)
def __init__(self): super().__init__() self.re_match = re.compile( r"""(\s*(?P<punctuation>[!¡\"&':;¿?,\.]+)){2,}""") self.sub = lambda x: x.group("punctuation").strip()[0]
def __init__(self): super().__init__() self.re_match = re.compile(r"[^(\p{L}|\ )]+")
import regex as re
import numpy as np

INIT_ATOMIC_COORDINATES_RE = re.compile(
    r"""
    \sMODULE\sQUICKSTEP:\s\sATOMIC\sCOORDINATES\sIN\sangstrom\s*\n
    \n
    \s+Atom\s+Kind\s+Element\s+X\s+Y\s+Z\s+Z\(eff\)\s+Mass\s*\n
    (\n)?
    (
        \s+(?P<atom>\d+)
        \s+(?P<kind>\d+)
        \s+(?P<element>\w+)
        \s+\d+
        \s+(?P<x>[\s-]\d+\.\d+)
        \s+(?P<y>[\s-]\d+\.\d+)
        \s+(?P<z>[\s-]\d+\.\d+)
        \s+[\s-]\d+\.\d+
        \s+[\s-]\d+\.\d+
        \n
    )+
    """, re.VERBOSE)


def parse_init_atomic_coordinates(output_file):
    for match in INIT_ATOMIC_COORDINATES_RE.finditer(output_file):
        #print(match)
        # only get the last match
        init_atomic_coordinates = []
        chemical_symbols = []
def __init__(self): super().__init__() self.re_match = re.compile(r"""[\",\\0]""")
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import regex as re
from datetime import datetime
from datetime import time
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse
from dateparser.utils import is_dateutil_result_obj_parsed, apply_timezone

_UNITS = r'year|month|week|day|hour|minute|second'
PATTERN = re.compile(r'(\d+)\s*(%s)\b' % _UNITS, re.I | re.S | re.U)


class FreshnessDateDataParser(object):
    """ Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """

    def __init__(self):
        self._now = None

    @property
    def now(self):
        return self._now if self._now else datetime.utcnow()

    @now.setter
    def now(self, value):
        self._now = value

    def _are_all_words_units(self, date_string):
        skip = [_UNITS, r'ago|in|\d+', r':|[ap]m']
#! /usr/bin/env python

# Update a bunch of files according to a script.
# The input file contains lines of the form <filename>:<lineno>:<text>,
# meaning that the given line of the given file is to be replaced
# by the given text. This is useful for performing global substitutions
# on grep output:

import os
import sys
import regex

pat = '^\([^: \t\n]+\):\([1-9][0-9]*\):'
prog = regex.compile(pat)


class FileObj:
    def __init__(self, filename):
        self.filename = filename
        self.changed = 0
        try:
            self.lines = open(filename, 'r').readlines()
        except IOError, msg:
            print '*** Can\'t open "%s":' % filename, msg
            self.lines = None
            return
        print 'diffing', self.filename

    def finish(self):
        if not self.changed:
            print 'no changes to', self.filename
            return
        try:
            os.rename(self.filename, self.filename + '~')
            fp = open(self.filename, 'w')
``b32decode`` function.
"""
from typing import Any
import binascii
import base64

import magic
import regex as re

from katana.unit import Unit as BaseUnit
from katana.unit import NotApplicable
from katana.util import is_good_magic
import katana.util

BASE32_PATTERN = rb"[A-Z2-7+/]+={0,6}"
BASE32_REGEX = re.compile(BASE32_PATTERN, re.MULTILINE | re.DOTALL | re.IGNORECASE)


class Unit(BaseUnit):

    GROUPS = ["raw", "decode", "base32"]
    """
    These are "tags" for a unit. Considering it is a Raw unit, "raw" is
    included, as well as the tag "decode", and the unit name "base32".
    """

    PRIORITY = 60
    """
    Priority works with 0 being the highest priority, and 100 being the
    lowest priority. 50 is the default priority. This unit has a low priority.
def callback(ed):
    import regex
    ed.find_text(regex.compile('A bold word'))
def validateXbrlFinally(val, *args, **kwargs):
    if not (val.validateHMRCplugin) or not val.txmyType:
        return

    modelXbrl = val.modelXbrl
    modelDocument = modelXbrl.modelDocument

    _statusMsg = _("validating {0} filing rules").format(val.disclosureSystem.name)
    modelXbrl.profileActivity()
    modelXbrl.modelManager.showStatus(_statusMsg)

    if modelDocument.type in (ModelDocument.Type.INSTANCE, ModelDocument.Type.INLINEXBRL):
        labelHasNegativeTermPattern = re.compile(r".*[(].*\w.*[)].*")
        companyReferenceNumberContexts = defaultdict(list)
        for c1 in modelXbrl.contexts.values():
            scheme, identifier = c1.entityIdentifier
            if scheme == "http://www.companieshouse.gov.uk/":
                companyReferenceNumberContexts[identifier].append(c1.id)

        uniqueFacts = {}  # key = (qname, context hash, unit hash, lang)
        mandatoryFacts = {}
        mandatoryGDV = defaultdict(set)
        factForConceptContextUnitLangHash = defaultdict(list)
        hasCompaniesHouseContext = any(cntx.entityIdentifier[0] == "http://www.companieshouse.gov.uk/"
                                       for cntx in val.modelXbrl.contexts.values())

        contextsUsed = set(f.context for f in modelXbrl.factsInInstance if f.context is not None)

        for cntx in contextsUsed:
            for dim in cntx.qnameDims.values():
                if dim.isExplicit:
                    _memName = dim.memberQname.localName
                    m = memNameNumPattern.match(_memName)
                    if m:
                        l = m.group(1)
                        n = int(m.group(2))
                    else:
                        l = _memName
                        n = None
                    for _gdvType in (val.txmyType, "business"):
                        gdv = genericDimensionValidation.get(_gdvType, EMPTYDICT).get(l)
                        if gdv:  # take first match
                            break
                    if (gdv and (n is None or
                                 (isinstance(gdv[0], int) and isinstance(gdv[1], int) and
                                  n >= gdv[0] and n <= gdv[1]))):
                        gdvFacts = [f for f in gdv if isinstance(f, str)]
                        if len(gdvFacts) == 1:
                            mandatoryGDV[gdvFacts[0]].add(GDV(gdvFacts[0], None, _memName))
                        elif len(gdvFacts) == 2:
                            mandatoryGDV[gdvFacts[0]].add(GDV(gdvFacts[0], gdvFacts[1], _memName))
                            mandatoryGDV[gdvFacts[1]].add(GDV(gdvFacts[1], gdvFacts[0], _memName))

        def checkFacts(facts):
            for f in facts:
                cntx = f.context
                unit = f.unit
                if getattr(f, "xValid", 0) >= 4 and cntx is not None and f.concept is not None:
                    factNamespaceURI = f.qname.namespaceURI
                    factLocalName = f.qname.localName
                    if factLocalName in mandatoryItems[val.txmyType]:
                        mandatoryFacts[factLocalName] = f
                    if factLocalName == "UKCompaniesHouseRegisteredNumber" and val.isAccounts:
                        if hasCompaniesHouseContext:
                            mandatoryFacts[factLocalName] = f
                        for _cntx in contextsUsed:
                            _scheme, _identifier = _cntx.entityIdentifier
                            if _scheme == "http://www.companieshouse.gov.uk/" and f.xValue != _identifier:
                                modelXbrl.error("JFCVC.3316",
                                    _("Context entity identifier %(identifier)s does not match Company Reference Number (UKCompaniesHouseRegisteredNumber) Location: Accounts (context id %(id)s)"),
                                    modelObject=(f, _cntx), identifier=_identifier, id=_cntx.id)
                    if not f.isNil:
                        factForConceptContextUnitLangHash[f.conceptContextUnitLangHash].append(f)
                    if f.isNumeric:
                        if f.precision:
                            modelXbrl.error("HMRC.5.4",
                                _("Numeric fact %(fact)s of context %(contextID)s has a precision attribute '%(precision)s'"),
                                modelObject=f, fact=f.qname, contextID=f.contextID, precision=f.precision)
                        try:  # only process validated facts
                            if f.xValue < 0:
                                label = f.concept.label(lang="en")
                                if not labelHasNegativeTermPattern.match(label):
                                    modelXbrl.error("HMRC.5.3",
                                        _("Numeric fact %(fact)s of context %(contextID)s has a negative value '%(value)s' but label does not have a bracketed negative term (using parentheses): %(label)s"),
                                        modelObject=f, fact=f.qname, contextID=f.contextID, value=f.value, label=label)
                        except AttributeError:
                            pass  # if not validated it should have failed with a schema error
                    # check GDV
                    if f.qname.localName in mandatoryGDV:
                        _gdvReqList = mandatoryGDV[factLocalName]
                        _gdvReqRemovals = []
                        for _gdvReq in _gdvReqList:
                            if any(_gdvReq.memLocalName == dim.memberQname.localName
                                   for dim in cntx.qnameDims.values()
                                   if dim.isExplicit):
                                _gdvReqRemovals.append(_gdvReq)
                                if _gdvReq.altFact in mandatoryGDV:
                                    _gdvAltList = mandatoryGDV[_gdvReq.altFact]
                                    _gdvAltRemovals = []
                                    for _gdvAlt in _gdvAltList:
                                        if any(_gdvAlt.memLocalName == dim.memberQname.localName
                                               for dim in cntx.qnameDims.values()
                                               if dim.isExplicit):
                                            _gdvAltRemovals.append(_gdvAlt)
                                    for _gdvAlt in _gdvAltRemovals:
                                        _gdvAltList.remove(_gdvAlt)
                        if _gdvReqRemovals and not f.xValue:  # fact was a mandatory name or description
                            modelXbrl.error("JFCVC.3315",
                                _("Generic dimension members associated name/description has no text: %(fact)s"),
                                modelObject=f, fact=f.qname)
                        for _gdvReq in _gdvReqRemovals:
                            _gdvReqList.remove(_gdvReq)
                if f.modelTupleFacts:
                    checkFacts(f.modelTupleFacts)

        checkFacts(modelXbrl.facts)

        if val.isAccounts:
            _missingItems = mandatoryItems[val.txmyType] - mandatoryFacts.keys()
            if hasCompaniesHouseContext and "UKCompaniesHouseRegisteredNumber" not in mandatoryFacts:
                _missingItems.add("UKCompaniesHouseRegisteredNumber")
            if _missingItems:
                modelXbrl.error("JFCVC.3312",
                    _("Mandatory facts missing: %(missingItems)s"),
                    modelObject=modelXbrl, missingItems=", ".join(_missingItems))
            f = mandatoryFacts.get("StartDateForPeriodCoveredByReport")
            if f is not None and f.xValue < _6_APR_2008:
                modelXbrl.error("JFCVC.3313",
                    _("Period Start Date (StartDateForPeriodCoveredByReport) must be 6 April 2008 or later, but is %(value)s"),
                    modelObject=f, value=f.value)
            memLocalNamesMissing = set("{}({})".format(_gdvRec.memLocalName, _gdvRec.factNames)
                                       for _gdv in mandatoryGDV.values()
                                       for _gdvRec in _gdv)
            if memLocalNamesMissing:
                modelXbrl.error("JFCVC.3315",
                    _("Generic dimension members have no associated name or description item, member names (name or description item): %(memberNames)s"),
                    modelObject=modelXbrl, memberNames=", ".join(sorted(memLocalNamesMissing)))

        aspectEqualFacts = defaultdict(list)
        for hashEquivalentFacts in factForConceptContextUnitLangHash.values():
            if len(hashEquivalentFacts) > 1:
                for f in hashEquivalentFacts:
                    aspectEqualFacts[(f.qname, f.contextID, f.unitID, f.xmlLang)].append(f)
                for fList in aspectEqualFacts.values():
                    f0 = fList[0]
                    if any(not f.isVEqualTo(f0) for f in fList[1:]):
                        modelXbrl.error("JFCVC.3314",
                            "Inconsistent duplicate fact values %(fact)s: %(values)s.",
                            modelObject=fList, fact=f0.qname, contextID=f0.contextID,
                            values=", ".join(f.value for f in fList))
                aspectEqualFacts.clear()
        del factForConceptContextUnitLangHash, aspectEqualFacts

    modelXbrl.profileActivity(_statusMsg, minTimeToShow=0.0)
    modelXbrl.modelManager.showStatus(None)
import math
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import pydash as _
import regex as re
import torch
import ujson
import yaml
import pprint
import json  # needed by LabJsonEncoder below (added; the original excerpt omitted it)

NUM_CPUS = mp.cpu_count()
DF_FILE_EXT = ['.csv', '.xlsx', '.xls']
FILE_TS_FORMAT = '%Y_%m_%d_%H%M%S'
RE_FILE_TS = re.compile(r'(\d{4}_\d{2}_\d{2}_\d{6})')
RE_INDENT = re.compile('(^\n)|(?!\n)\s{2,}|(\n\s+)$')
SPACE_PATH = ['agent', 'agent_space', 'aeb_space', 'env_space', 'env']


class LabJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return str(obj)
def _generate_as_number_regex(self, as_numbers):
    """Generate regex for finding AS number."""
    # Match a non-digit, any of the AS numbers and another non-digit.
    # Using lookahead and lookbehind to match on context but not include that context in the match.
    self.as_num_regex = regex.compile('(\D|^)\K(' + '|'.join(as_numbers) + ')(?=\D|$)')
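# Quick sketch of what the \K construct buys here (added for illustration; the
# AS numbers are hypothetical, and the third-party `regex` module is required
# because the standard-library `re` does not support \K):
import regex
as_num_regex = regex.compile('(\\D|^)\\K(' + '|'.join(['65001', '65002']) + ')(?=\\D|$)')
m = as_num_regex.search("neighbor 10.0.0.1 remote-as 65002;")
print(m.group())  # '65002' -- the preceding space is consumed but kept out of the match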
# -*- coding: utf-8 -*-
# @Date    : 2020-03-23
# @Author  : zhaoguocai
# @QQ      : 30516864
# Discussion / QQ group: 685096991

a = r'(\$\w+\((?>[^()]+|(?R))*\))'

import time

start_time = time.time()
print(1111)
end_time = time.time()
print(end_time - start_time)

import regex

s = '$a1($a2($a3($a99("dsd")),a4,$a5(1,2),a6),a7,$a8($a9()))'
keyword_regex = regex.compile(r'(\$\w+\((?>[^()]+|(?R))*\))')
keyword_list = regex.findall(keyword_regex, s)
print(keyword_list)
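# Added note: (?R) recurses the entire pattern, so it is meant to descend into
# nested "$name(...)" calls, and (?>...) is an atomic group that forbids
# backtracking into an alternative once it has matched. (?R) recursion is not
# available in the standard-library re module (and atomic groups only arrived
# there in Python 3.11), which is why the third-party regex module is used here.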
def _generate_sensitive_word_regex(cls, sensitive_words):
    """Compile and return regex for the specified list of sensitive words."""
    return regex.compile('({})'.format('|'.join(sensitive_words)), regex.IGNORECASE)
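# Usage sketch (added; the owning class is not shown in this excerpt, so
# "SomeConfigClass" and the word list below are hypothetical):
#   pattern = SomeConfigClass._generate_sensitive_word_regex(["secret", "s3cr3t"])
#   pattern.sub("<removed>", "snmp-server community SECRETstring")
#     -> 'snmp-server community <removed>string'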
(c) Copyright 2013 Mark V Systems Limited, All rights reserved.

References:
  https://xbrl.frc.org.uk (taxonomies, filing requirements, consistency checks)
  https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/434597/joint-filing-validation-checks.pdf
'''
import os
from arelle import ModelDocument, XmlUtil
from arelle.ModelValue import qname, dateTime, DATE
try:
    import regex as re
except ImportError:
    import re
from collections import defaultdict

memNameNumPattern = re.compile(r"^([A-Za-z-]+)([0-9]+)$")
compTxmyNamespacePattern = re.compile(r"http://www.govtalk.gov.uk/uk/fr/tax/uk-hmrc-ct/[0-9-]{10}")
EMPTYDICT = {}

_6_APR_2008 = dateTime("2008-04-06", type=DATE)

commonMandatoryItems = {
    "EntityCurrentLegalOrRegisteredName", "StartDateForPeriodCoveredByReport",
    "EndDateForPeriodCoveredByReport", "BalanceSheetDate"
}
mandatoryItems = {
    "ukGAAP": commonMandatoryItems | {
        "DateApprovalAccounts", "NameDirectorSigningAccounts", "EntityDormant",
        "EntityTrading", "DateSigningDirectorsReport", "DirectorSigningReport"
    },
    "charities": commonMandatoryItems | {
def __init__(self, patterns: List[str]) -> None:
    self.patterns = patterns
    self.joined_patterns = re.compile("|".join(self.patterns))
def generate_default_sensitive_item_regexes():
    """Compile and return the default password and community line regexes."""
    combined_regexes = default_pwd_line_regexes + default_com_line_regexes + \
        extra_password_regexes
    return [[(regex.compile(_ALLOWED_REGEX_PREFIX + regex_), num)
             for regex_, num in group] for group in combined_regexes]
def testDependenciesAppearInTheSourceBeforeFilesThatRequiredThem(self):
    ''' Test dependencies appear in the source before files that required them '''
    self.assertRegexpMatches(str(self.get_asset('application.js')),
                             re.compile(r"""Project.+Users.+focus""", re.M | re.S))
def start(self):
    self.setVisible(True)
    self.pi.start()
    self.is_running = True

def stop(self):
    self.setVisible(False)
    self.pi.stop()
    self.is_running = False
# }}}


quote_map = {'"': '"“”', "'": "'‘’"}
qpat = regex.compile(r'''(['"])''')
spat = regex.compile(r'(\s+)')
invisible_chars = '(?:[\u00ad\u200c\u200d]{0,1})'
SEARCH_RESULT_ROLE = Qt.ItemDataRole.UserRole
RESULT_NUMBER_ROLE = SEARCH_RESULT_ROLE + 1
SPINE_IDX_ROLE = RESULT_NUMBER_ROLE + 1


def text_to_regex(text):
    has_leading = text.lstrip() != text
    has_trailing = text.rstrip() != text
    if text and not text.strip():
        return r'\s+'
    ans = []
    for wpart in spat.split(text.strip()):
        if not wpart.strip():
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}
def _compile_regexes(self):
    for key, value in self.patterns.items():
        self.patterns[key] = regex.compile(value)