Example #1
def wordRegex():
    """
    #I'm including the code to create the regex, which makes it more readable.
    Note that this uses *unicode*: among other things, that means that it needs to be passed
    a unicode-decoded string: and that we have to use the "regex" module instead of the "re" module. 
    Python3 will make this, perhaps, easier.
    (See how it says import regex as re up there? Yikes.)
    """
    fp = os.path.realpath(__file__)
    fp = os.path.dirname(fp)
    fp = os.path.dirname(fp)
    cnf = os.path.join(fp, 'bookworm.cnf')

    with open(cnf) as ff:
        for line in ff:
            if 'database' in line:
                bwname = line.split('database = ')[-1]

    if '_phonemes' in bwname:
        print('Tokenizing text using the PHONEME regex')
        bigregex = re.compile(r'\b\w*[^\s]', re.UNICODE | re.IGNORECASE)
    else:
        print('Tokenizing text using the WORD regex')
        MasterExpression = ur"\p{L}+"
        possessive = MasterExpression + ur"'s"
        numbers = r"(?:[\$])?\d+"
        decimals = numbers + r"\.\d+"
        abbreviation = r"(?:mr|ms|mrs|dr|prof|rev|rep|sen|st|sr|jr|ft|gen|adm|lt|col|etc)\."
        sharps = r"[a-gjxA-GJX]#"
        punctuators = r"[^\p{L}\p{Z}]"
        """
        Note: the alternation tries the most complicated forms first and falls
        through to simpler and simpler forms as it goes.
        """
        bigregex = re.compile("|".join([decimals, possessive, numbers, abbreviation,
                                        sharps, punctuators, MasterExpression]),
                              re.UNICODE | re.IGNORECASE)
    return bigregex
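A minimal usage sketch (mine, not part of the example), with a few of the sub-patterns redefined locally in Python 3 spelling so it runs standalone; it shows why the alternation tries the most complicated forms first:

import regex as re

word = r"\p{L}+"
numbers = r"(?:[\$])?\d+"
decimals = numbers + r"\.\d+"
tokens = re.compile("|".join([decimals, word + "'s", numbers, word]),
                    re.UNICODE | re.IGNORECASE)
print(tokens.findall("Dr. Smith's $3.50 fee"))
# -> ['Dr', "Smith's", '$3.50', 'fee']   ($3.50 stays whole, Smith's keeps its 's)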
Example #2
def fetch_post_id_and_site_from_url(url):
    if url is None:
        return None
    post_type_regex = r"\/\d+#\d+$"
    post_type = ""
    search_regex = ""
    if regex.compile(post_type_regex).search(url):
        post_type = "answer"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)/questions/\d+/.+/(\d+)#\d+$"
    else:
        post_type = "question"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)/questions/(\d+)(?:/.*)?$"
    found = regex.compile(search_regex).search(url)
    if found is not None:
        try:
            post_id = found.group(2)
            post_site = found.group(1)
            return (post_id, post_site, post_type)
        except IndexError:
            return None
    search_regex = r"^(?:https?:)?\/\/([\w.]+)/(q|a)/(\d+)(?:/\d+)?/?"
    found = regex.compile(search_regex).search(url)
    if found is None:
        return None
    try:
        post_id = found.group(3)
        post_site = found.group(1)
        post_type = "question" if found.group(2) == "q" else "answer"
        return (post_id, post_site, post_type)
    except IndexError:
        return None
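Illustrative calls with made-up URLs, one per branch of the logic above:

fetch_post_id_and_site_from_url(
    "//stackoverflow.com/questions/1732348/regex-match-open-tags/1732454#1732454")
# -> ('1732454', 'stackoverflow.com', 'answer')
fetch_post_id_and_site_from_url("//stackoverflow.com/q/1732348")
# -> ('1732348', 'stackoverflow.com', 'question')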
Example #3
def main(in_file, out_file, arg_i, arg_v ):
    global module_dict, import_dict, qname_dict, fixup_dict

    indent_re = regex.compile(r'^\s*(#*)')
    qnames_re = regex.compile(r'Q\w+')

    # If we will do suggested imports, load all the classnames
    # in the module_dict. This may take some little while!
    if arg_i :
        load_module_dict()

    line = in_file.readline()
    while line:

        out_lines = []
        indent_m = indent_re.match(line)
        skip = 0 < len(indent_m.group(1)) # is a comment or
        skip |= line.startswith('import') # ..an import or
        skip |= (line.startswith('from') and (0 > line.find('__future')) )
        if not skip :
            # set up appropriate indent for comments
            indent = indent_m.group(0) + '#! '

            # Note all the QXxxx names in this line
            qnames_in_line = qnames_re.findall(line)

            if arg_i :
                # Add each to the set of its module
                for qname in qnames_in_line :
                    if qname in module_dict : # only valid ones
                        import_dict[module_dict[qname]].add(qname)

            # Check the QXxxx names for the troublesome ones.
            for qname in qnames_in_line :
                if qname in qname_dict :
                    out_lines += qname_dict[qname](line)

            # Run through the non-QXxxx indicator strings.
            for (indicator, fixup) in fixup_dict.items() :
                if indicator in line :
                    out_lines += fixup(line)

            # Write any annotation comments.
            for comment in out_lines:
                out_file.write( indent + comment + '\n')
        #endif skip

        out_file.write( line )
        line = in_file.readline()
    # end while line
    if arg_i :
        out_file.write('\n\n#! Suggested import statements for class-names seen above.\n')
        out_file.write('#! You must move these to the top of the file replacing any\n')
        out_file.write('#! existing PyQt4 import statements.\n')
        for (mod_name, class_set) in import_dict.items() :
            if len(class_set) : # not an empty set
                out_file.write('from PyQt5.{0} import (\n    '.format(mod_name))
                join_string = ',\n    ' if arg_v else ', '
                out_file.write(join_string.join(sorted(class_set)))
                out_file.write(')\n')
Example #4
    def __init__(self, start=None, end=None, void=None, structs=None):
        self.start = start if start else re.compile(r"<(\w+).*?(?<!/)>")
        self.end = end if end else re.compile(r"</(\w+)>")
        self.void = void if void else re.compile(r"<(\w+).*?/>")
        self.stags = set()
        self.etags = set()
        self.vtags = set()
Example #5
def readConfigFile (
	source		# pathname to config file to read
	):
	# Purpose: read the configuration file at 'source', parse it,
	#	store values in a dictionary
	# Returns: the dictionary parsed from 'source'
	# Assumes: 'source' exists
	# Effects: reads from the file system
	# Throws: IOError if there are problems reading

	fp = open (source, 'r')
	lines = fp.readlines ()
	fp.close ()

	ignore_line = regex.compile ('[ \t]*#')		# comment line
	data_line = regex.compile ('[ \t]*'
				'\([^ \t]+\)'
				'[ \t]*\(.*\)')	
	dict = {}

	for line in lines:
		if ignore_line.match (line) == -1:
			if data_line.match (line) != -1:
				(parameter, value) = data_line.group (1,2)
				dict [string.upper (parameter)] = value
	return dict
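For readers who have not met it: this example targets the long-removed Python 1.x regex module, in which match() returns -1 on failure, groups are written \(...\), and group() is called on the pattern object. A rough modern equivalent with the re module, as a sketch:

import re

def read_config_file(source):
    # rough re-module translation of readConfigFile above
    ignore_line = re.compile(r'[ \t]*#')                  # comment line
    data_line = re.compile(r'[ \t]*([^ \t]+)[ \t]*(.*)')  # parameter, value
    parsed = {}
    with open(source, 'r') as fp:
        for line in fp:
            if ignore_line.match(line) is None:
                m = data_line.match(line)
                if m is not None:
                    parameter, value = m.group(1, 2)
                    parsed[parameter.upper()] = value
    return parsed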
Example #6
def build_check_whitespace_timestamp(t):
    CR_RE = re.compile(r'\r')
    TRAILING_WHITESPACE_RE = re.compile(r'\s+\n\Z')
    NO_NEWLINE_RE = re.compile(r'[^\n]\Z')
    ALL_WHITESPACE_RE = re.compile(r'\s+\Z')
    errors = 0
    for filename in sorted(t.newer(t.dependencies)):
        whitespace = False
        for lineno, line in enumerate(open(filename)):
            if CR_RE.search(line):
                t.info('%s:%d: carriage return character in line', filename, lineno + 1)
                errors += 1
            if TRAILING_WHITESPACE_RE.search(line):
                t.info('%s:%d: trailing whitespace', filename, lineno + 1)
                errors += 1
            if NO_NEWLINE_RE.search(line):
                t.info('%s:%d: no newline at end of file', filename, lineno + 1)
                errors += 1
            whitespace = ALL_WHITESPACE_RE.match(line)
        if whitespace:
            t.info('%s: trailing whitespace at end of file', filename)
            errors += 1
    if errors:
        t.error('%d whitespace errors' % (errors,))
    t.touch()
Example #7
def clean_line(line):
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"}  # note: put \. in the file; how can I check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Example #8
def _prep_cleanup_re():
    symbols = []
    digits  = []
    white   = []
    for c in range(0x110000):   # 0 .. 0x10FFFF inclusive
        x = chr(c)
        cat = unicodedata.category(x)
        # All punctuation, symbols, and whitespace, C0 and C1 controls,
        # and "format effectors" (e.g. ZWNJ, RLE).  Cn (unassigned),
        # Cs (surrogate), and Co (private use) are not stripped.
        if cat[0] in ('P', 'S'):
            # Don't strip leading and trailing hyphens and apostrophes.
            # FIXME: this really ought to be an exhaustive list of P-
            # and S-class characters that can be *part of* a word.
            if x in ('-', '‐', '\'', '’'): continue
            # These characters need to be escaped inside a character class.
            # '-' is not included because the preceding 'if' removed it.
            if x in ('\\', '[', ']'):
                symbols.append('\\' + x)
            else:
                symbols.append(x)
        elif cat[0] == 'N':
            digits.append(x)
        elif cat[0] == 'Z' or cat in ('Cc', 'Cf'):
            white.append(x)
    symbols = "".join(symbols)
    digits  = "".join(digits)
    white   = "".join(white)
    return (
        re.compile("^[" + symbols + white  + "]+"),
        re.compile("["  + symbols + white  + "]+$"),
        re.compile("^[" + symbols + white  + digits + "'’\\-‐" + "]+$"),
        re.compile("["  + symbols + digits + "]+")
    )
Example #9
def updateline(file, key, value, casefold = 1):
	try:
		f = open(file, 'r')
		lines = f.readlines()
		f.close()
	except IOError:
		lines = []
	pat = key + ':\(.*\)\n'
	if casefold:
		prog = regex.compile(pat, regex.casefold)
	else:
		prog = regex.compile(pat)
	if value is None:
		newline = None
	else:
		newline = '%s: %s' % (key, value)
	for i in range(len(lines)):
		line = lines[i]
		if prog.match(line) == len(line):
			if newline is None:
				del lines[i]
			else:
				lines[i] = newline
			break
	else:
		if newline is not None:
			lines.append(newline)
	f = open(tempfile, 'w')  # 'tempfile' is presumably a module-level output path
	for line in lines:
		f.write(line)
	f.close()
Example #10
	def __load_txt(self):
		rn1 = r"(?P<authors>((\pL\. ?(\pL\. )?\pL+,? )|(\pL+ \pL\. ?(\pL\.)?,? )" #regular for authors
		rn2 = r"|(\p{Lu}\p{Ll}+ \p{Lu}\p{Ll}+,? )"
		rn3 = r")+)"
		ra_ru = r"(?P<article>\p{Lu}\p{Ll}+ \p{Ll}+.*?) *\/\/ *" #regular for article
		ra_eng = r"(?P<article>\p{Lu}.*?) *\/\/ *" #regular for article
		rj = r'(?P<source>[ \pL"“”]+)' #regular for source
		rm = r"(?P<misc>.+)" #regular for misc
		reg_ru = re.compile(rn1+rn2+rn3+ra_ru+rj+rm, re.UNICODE)
		reg_eng = re.compile(rn1+rn3+ra_eng+rj+rm, re.UNICODE)
		data = []
		f = open(self.filename, 'r')
		content = f.read()
		items = content.split('\n')
		for item in items:
			res = None
			if isEnglish(item[:15]):
				res = reg_eng.match(item.strip())
			else:
				res = reg_ru.match(item.strip())
			if res is not None:
				publication = Publication()
				publication.authors = Author.parseAuthors(res.group("authors"))
				data.append({"authors": split_authors(res.group("authors")), "article": res.group("article"), "source": res.group("source"), "misc": res.group("misc")})
			else:
				print("Wrong line: " + item)
		return data
Example #11
def sample1(filename, aft=None, fore=None, top=None, home=None):
    doc = SeriesDocument('HTMLgen.rc')
    doc.goprev,doc.gonext,doc.gotop,doc.gohome = aft,fore,top,home
    doc.background = '../image/texturec.jpg'
    doc.banner = ('../image/historic.gif', 472, 60)
    doc.author = '1776 Thomas Jefferson'
    doc.email = '*****@*****.**'
    doc.logo = ('../image/eagle21.gif', 64, 54)
    # parse Declaration of Independence
    re_hline = regex.compile('^--+$')
    re_title = regex.compile('^Title:\(.*$\)')
    font2 = Font(size='+2')
    s = open(os.path.join(datadir, 'DoI.txt')).read()
    paragraphs = regsub.split(s, '\n\([\t ]*\n\)+')
    for para in paragraphs:
        if not para: continue
        if re_title.search(para) > -1:
            doc.title = re_title.group(1)
        elif re_hline.search(para) > -1:
            doc.append(HR())
        else:
            p = Paragraph( para )
            # using \` to match beginning of paragraph
            # ^ won't work because it'll match all the newlines
            n = p.markup('\`\(\w\)', font2, reg_type='regex')
            doc.append(p)
    doc.write(os.path.join(htmldir, filename))
Example #12
def compileRegex(string, flags):
    try:
        return regex.compile(string, convertRegex(flags))
    except regex.error:
        # compilation failed: swap hexadecimal escape patterns and retry
        for od in HEXADECIMAL_PATTERNS:
            string = string.replace(od[0], od[1])
        return regex.compile(string, convertRegex(flags))
Example #13
def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):
    """Remove and substitute post-processing for Greek TLG text.
    TODO: Surely more junk to pull out. Please submit bugs!
    TODO: {.+?}|\(.+?\) working?
    TODO: This is rather slow right now; help in speeding it up is welcome.
    """
    remove_comp = regex.compile(r'-\n|«|»|<|>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|[a-zA-Z0-9]', flags=regex.VERSION1)
    text = remove_comp.sub('', text)

    new_text = None
    if rm_punctuation:
        new_text = ''
        punctuation = [',', '·', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']
        if rm_periods:
            punctuation += ['.', ';']
        for char in text:
            # second try at rming some punctuation; merge with above regex
            if char in punctuation:
                pass
            else:
                new_text += char
    if new_text:
        text = new_text

    # replace line breaks w/ space
    replace_comp = regex.compile(r'\n')
    text = replace_comp.sub(' ', text)

    comp_space = regex.compile(r'\s+')
    text = comp_space.sub(' ', text)

    return text
Example #14
def expand_parens(str, include_spaces=False):
    output = []

    if "‣" in str:
        for i in str.split("‣"):
            output.extend(expand_parens(i))
        return output

    if include_spaces:
        regex1 = re.compile(r"(^.*)\((.+)\)(.*$)")
        regex2 = re.compile(r"(^.*)\((.+)\)(.*$)")
    else:
        regex1 = re.compile(r"(^.*[^ ])\(([^ ]+)\)(.*$)")
        regex2 = re.compile(r"(^.*)\(([^ ]+)\)([^ ].*$)")

    re_match1 = regex1.search(str)
    re_match2 = regex2.search(str)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [str]

    output = [clean_str(without), clean_str(within)]

    return output
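A hedged usage sketch (clean_str comes from the surrounding module and is assumed to just normalize whitespace); the classic case is expanding an optional plural:

expand_parens("dog(s)")
# -> [clean_str("dog"), clean_str("dogs")], i.e. the text without and with the parenthesized part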
Example #15
def test_post(title, body, user_name, site, is_answer, body_is_summary):
    result = []
    for rule in FindSpam.rules:
        body_to_check = body
        if rule['stripcodeblocks']:
            body_to_check = regex.sub("<pre>.*?</pre>", "", body, flags=regex.DOTALL)
            body_to_check = regex.sub("<code>.*?</code>", "", body_to_check, flags=regex.DOTALL)
        if rule['all'] != (site in rule['sites']):
            matched_title = regex.compile(rule['regex'], regex.UNICODE).findall(title)
            matched_username = regex.compile(rule['regex'], regex.UNICODE).findall(user_name)
            matched_body = regex.compile(rule['regex'], regex.UNICODE).findall(body_to_check)
            if matched_title and rule['title']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_title):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "title"))
            if matched_username and rule['username']:
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_username):
                        result.append(rule['reason'])
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", "username"))
            if matched_body and rule['body'] and (not body_is_summary or rule['body_summary']):
                type_of_post = "answer" if is_answer else "body"
                try:
                    if getattr(FindSpam, "%s" % rule['validation_method'])(matched_body):
                        result.append(rule['reason'].replace("{}", type_of_post))
                except KeyError:  # There is no special logic for this rule
                    result.append(rule['reason'].replace("{}", type_of_post))
    return result
Example #16
  def rebuildregexes(self):
    """
    rebuild a regex for priority

    will need a colored and a noncolored regex for each priority
    """
    colorres = []
    noncolorres = []
    for trig in self.uniquelookup.values():
      if trig['enabled']:
        if 'matchcolor' in trig \
            and trig['matchcolor']:
          colorres.append("(?P<%s>%s)" % (trig['unique'], trig['nonamedgroups']))
        else:
          noncolorres.append("(?P<%s>%s)" % (trig['unique'], trig['nonamedgroups']))

    if colorres:
      try:
        self.regex['color'] = re.compile("|".join(colorres))
      except re.error:
        self.api('send.traceback')('Could not compile color regex')
    else:
      self.regex['color'] = ""

    try:
      self.regex['noncolor'] = re.compile("|".join(noncolorres))
    except re.error:
      self.api('send.traceback')('Could not compile regex')
Example #17
def clean_line(line):
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    pos = re.search(reg_ayyen_tur, line)

    if pos:
        line = line[:pos.start()]

    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Example #18
def get_user_from_list_command(cmd):  # for example, !!/addblu is a list command
    cmd_merged_spaces = regex.sub("\\s+", " ", cmd)
    cmd_parts = cmd_merged_spaces.split(" ")

    uid = -1
    site = ""

    if len(cmd_parts) == 1:
        uid_site = get_user_from_url(cmd_parts[0])
        if uid_site is not None:
            uid, site = uid_site
    elif len(cmd_parts) == 2:
        uid = cmd_parts[0]
        site = cmd_parts[1]
        digit_re = regex.compile("^[0-9]+$")
        site_re = regex.compile(r"^(\w+\.stackexchange\.com|\w+\.(com|net))$")
        if not digit_re.match(uid):
            uid = -1
            site = ""
        elif not site_re.match(site):
            exists, name = datahandling.check_site_and_get_full_name(site)
            if exists:
                return uid, name
            else:
                return -2, name
    return uid, site
Example #19
def fetch_post_id_and_site_from_url(url):
    if url is None:
        return None
    trimmed_url = rebuild_str(url)
    post_type_regex = r"(?:\/\d+)?#\d+$"
    post_type = ""
    search_regex = ""
    if regex.compile(post_type_regex).search(trimmed_url):
        post_type = "answer"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)\/questions\/\d+\/.+[/#](\d+)(?:#\d+)?$"
    else:
        post_type = "question"
        search_regex = r"^(?:https?:)?\/\/([\w.]+)/questions/(\d+)(?:/.*)?$"
    found = regex.compile(search_regex).search(trimmed_url)
    if found is not None:
        try:
            post_id = found.group(2)
            post_site = found.group(1)
            return (post_id, post_site, post_type)
        except IndexError:
            return None
    search_regex = r"^(?:https?:)?\/\/([\w.]+)/(q|a)/(\d+)(?:/\d+)?/?"
    found = regex.compile(search_regex).search(trimmed_url)
    if found is None:
        return None
    try:
        post_id = found.group(3)
        post_site = found.group(1)
        post_type = "question" if found.group(2) == "q" else "answer"
        return (post_id, post_site, post_type)
    except IndexError:
        return None
Example #20
    def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
        """
        Builds a codec converting between graphemes/code points and integer
        label sequences.

        charset may either be a string, a list or a dict. In the first case
        each code point will be assigned a label, in the second case each
        string in the list will be assigned a label, and in the final case each
        key string will be mapped to the value sequence of integers. In the
        first two cases labels will be assigned automatically.

        As 0 is the blank label in a CTC output layer, output labels and input
        dictionaries are/should be 1-indexed.

        Args:
            charset (unicode, list, dict): Input character set.
        """
        if isinstance(charset, dict):
            self.c2l = charset
        else:
            self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
        # map integer labels to code points because regex only works with strings
        self.l2c = {}  # type: Dict[str, str]
        for k, v in self.c2l.items():
            self.l2c[''.join(chr(c) for c in v)] = k

        # sort prefixes for c2l regex
        self.c2l_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
        # sort prefixes for l2c regex
        self.l2c_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))
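A small sketch of how the two patterns might drive encoding (Codec is a hypothetical name for the class this __init__ belongs to):

codec = Codec("ab c")
labels = [codec.c2l[m.group()] for m in codec.c2l_regex.finditer("a bc")]
# labels == [[2], [1], [3], [4]], since c2l == {' ': [1], 'a': [2], 'b': [3], 'c': [4]}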
Example #21
	def __init__(self, Normalizer, normalization = False):
		super(RegExp, self).__init__()

		self.normalization = normalization

		self.Normalizer = Normalizer

		self.matrices = {
 			"primarySource" : {
				"matcher" : self.generate("primarySource", False),
				"grouper" : self.generate("primarySource")
			},
			"secondarySource" : {
				"matcher" : self.generate("secondarySource", False),
				"grouper" : self.generate("secondarySource")
			},
			"quotes" : {
				"matcher" : self.generate("quote", False),
				"grouper" : self.generate("quote")
			},
			"senses" : {
				"grouper" : re.compile("^([1-9]{1,3}|[abcdefABCDEFαβγδ]{1}|IX|IV|V?I{0,3})$"),
				"splitter" : re.compile("[–\,]{0,1}\s([1-9]{1,3}|[abcdefABCDEFαβγδ]{1}|IX|IV|V?I{0,3})\)\s")
			},
			"greek" : {
				"matcher" :  self.generate("greek"),
				"grouper" : re.compile("(?P<match>(?:(?:[\p{Greek}µ']+)+[\s\.\,]*)+)")
			},
			"firstLine" : {
				"grouper" : self.generate("firstLine")
			}
		}
Example #22
def add_spaces(text, exclude=None):
    if exclude:
        patt_exclude = regex.escape(exclude)
        patt_eng_cjk = regex.compile(u"([[%s]--%s])([%s])" % (CHAR_ENG_LEFT, patt_exclude, CHAR_CJK))
        patt_cjk_eng = regex.compile(u"([%s])([[%s]--%s])" % (CHAR_CJK, CHAR_ENG_RIGHT, patt_exclude))
    else:
        patt_eng_cjk = PATTERN_ENG_CJK
        patt_cjk_eng = PATTERN_CJK_ENG

    def add_space_func(index1, index2):
        def add_space(match):
            return u"%s %s" % (match.group(index1), match.group(index2))

        return add_space

    text = patt_cjk_eng.subn(add_space_func(1, 2), text)[0]
    text = patt_eng_cjk.subn(add_space_func(1, 2), text)[0]

    if not (exclude and '"' in exclude):
        # XXX"YYY"XXX -> XXX "YYY" XXX
        # where X and Y are CJK characters
        is_left_dquote = True
        is_left_squote = True
        out = StringIO.StringIO()
        for i in xrange(len(text)):
            prev_char = text[i - 1] if i > 0 else None
            cur_char = text[i]
            next_char = text[i + 1] if i < len(text) - 1 else None
            if cur_char == u'"':
                if is_left_dquote:
                    if _is_cjk(prev_char):
                        out.write(u' "')
                    else:
                        out.write(u'"')
                    is_left_dquote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u'" ')
                    else:
                        out.write(u'"')
                    is_left_dquote = True
            elif cur_char == u"'":
                if is_left_squote:
                    if _is_cjk(prev_char):
                        out.write(u" '")
                    else:
                        out.write(u"'")
                    is_left_squote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u"' ")
                    else:
                        out.write(u"'")
                    is_left_squote = True
            else:
                out.write(cur_char)
        text = out.getvalue()
        out.close()

    return text
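For illustration, the expected behavior, assuming the module-level constants (CHAR_CJK, CHAR_ENG_LEFT, CHAR_ENG_RIGHT, PATTERN_ENG_CJK, PATTERN_CJK_ENG) are defined as their names suggest:

# add_spaces(u"中文English中文")  ->  u"中文 English 中文"
# add_spaces(u'说"你好"再走')      ->  u'说 "你好" 再走'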
Example #23
    def setliteral(self, tag):
        self.literal = 1
        # NB: 're' here is a plain pattern string, not the re module
        re = "%s%s[%s]*%s" % (ETAGO, tag, string.whitespace, TAGC)
        if self._normfunc is string.lower:
            self._lit_etag_re = regex.compile(re, regex.casefold)
        else:
            self._lit_etag_re = regex.compile(re)
Example #24
def all_caps_text(s, site):
    s = regex.sub("<[^>]*>", "", s)   # remove HTML tags
    s = regex.sub("&\w+;", "", s)     # remove HTML entities
    if len(s) <= 150 and regex.compile(ur"SQL|\b(ERROR|PHP|QUERY|ANDROID|CASE|SELECT|HAVING|COUNT|GROUP|ORDER BY|INNER|OUTER)\b").search(s):
        return False, ""   # common words in non-spam all-caps titles
    if len(s) >= 25 and regex.compile(ur"^(?=.*\p{upper})\P{lower}*$", regex.UNICODE).search(s):
        return True, "All in caps"
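Expected behavior of the two checks above, on made-up titles:

all_caps_text("MY PHP QUERY RETURNS AN SQL ERROR", "stackoverflow.com")
# -> (False, "")   short and contains whitelisted technical words
all_caps_text("PLEASE HELP ME THIS IS VERY URGENT", "stackoverflow.com")
# -> (True, "All in caps")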
Example #25
    def _replace_for(self, text, nested_position, keyword_number=1):
        """
        Finds and replaces the % for ...: ... % endfor loops
        of the mail.template. It will create keyword records for
        each loop found.
        :param text: mail.template text
        :param nested_position: counts how nested the current pass is
        :param keyword_number: counts how many for loops we found
        :return: simplified text without the for code, keywords found
        """
        # Regex for finding text wrapped in loops
        loop_regex = r'(% for .*?:$)(.*?)(% endfor)'
        ul_loop_regex = r'(?:<ul[^<]*?)(% for .*?:$)(.*?)(% endfor)(.*?</ul>)'

        # First scan for ul_loops
        for_pattern = re.compile(ul_loop_regex, flags=re.DOTALL | re.MULTILINE)
        simple_text, found_keywords = self._replace_for_type(
            text, nested_position, keyword_number, 'for_ul', for_pattern)
        keyword_number += len(found_keywords)

        # Then scan for regular loops
        for_pattern = re.compile(loop_regex, flags=re.DOTALL | re.MULTILINE)
        simple_text, keywords = self._replace_for_type(
            simple_text, nested_position, keyword_number, 'for', for_pattern)
        found_keywords |= keywords

        return simple_text, found_keywords
Example #26
    def _reload_allowed_list_file(self):
        '''(Re)loads the list of rules for non-segment borders, i.e. rules that
        stop a possible segment border from being split (unless overridden by a
        forcing rule specified for the stop rule). The stop rules are pairs of
        two rules, of which the first is matched against the segment to the
        left and the latter against the segment to the right. The filename is
        given in the __init__, and the default file is "./data/stop_list".

        See the __init__() and segment() functions for more about the algorithm.
        ATTENTION: note that verbose regexps are used.'''

        with open(self._allowed_list_filename, 'r') as f:
            _filedata = f.readlines()
        
        self._allowed_regexps = list()
        _rule_left = ''
        _rule_right = ''

        for i in range(len(_filedata)):
            # rules must be specified in correct order: first left, then right
            if _filedata[i].startswith('LEFT:'):
                _rule_left = regex.compile(_filedata[i][5:], regex.VERBOSE)
            elif _filedata[i].startswith('RIGHT:'):
                _rule_right = regex.compile(_filedata[i][6:], regex.VERBOSE)
                self._allowed_regexps.append((_rule_left, _rule_right))
                _rule_left = ''
                _rule_right = ''
            else:
                # everything else is ignored
                continue
Example #27
    def __init__(self, directory_name):
        self.directory = directory_name
        self.unigram_frequency = Counter()
        self.trigrams = dict()
        self.trigram_load_pattern = re2.compile(r'^([^ ]*) ([^ ]*) ([^\t]*)\t(\d*)')
        self.middle_token_pattern = re2.compile(r'^\p{posix_alnum}*$', re2.UNICODE)
        super(FileScorer, self).__init__()
Example #28
    def __init__(self, src, javaFlag=0):
        Doxy2SWIG.__init__(self, src, javaFlag)
        """ Turns on the title, brief description and detailed description markup.
        Turn them off when inside member documentatation.

        """
        self.FilterTitle = True
        self.sitkClassName=''
        self.EmptyText = False
        # compiled regular expressions
        # common formula types in xml version of documentation
        self.dollarFormula = re.compile("^\\$(.+)\\$$")
        self.arrayFormula = re.compile("^\\\\\\[(.+)\\\\\\]$")
        # more complex formula layout, that breaks R documentation
        # checks.
        self.mathstuff1 = re.compile(r"\\begin\{array\}\{[^}]+\}")
        self.mathstuff2 = re.compile(r"\\begin\{array\}")
        self.mathstuff3 = re.compile(r"\\end\{array\}")
        # a complex recursive regular expression, to deal with formula
        # inside mbox and text structures
        self.mathstuff4 = regex.compile(r"\\mbox({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
        self.mathstuff5 = regex.compile(r"\\text({((?>[^}{]*(?1)?)*)})", flags=regex.V1)
        # the special doxygen tags - note - not greedy
        self.mathstuff6 = re.compile(r"\\f\$(.+?)\\f\$")
        # alignment tags
        self.mathstuff7 = re.compile(r" & ")
Example #29
    def __init__(self):
        # These attributes are set by the parse method
        self.doc = None
        self.para = None
        self.current_string = None
        self.flow = None

        self.stateMachine = StateMachine()
        self.stateMachine.add_state("PARA", self._para)
        self.stateMachine.add_state("ESCAPE", self._escape)
        self.stateMachine.add_state("END", None, end_state=1)
        self.stateMachine.add_state("ANNOTATION-START", self._annotation_start)
        self.stateMachine.add_state("CITATION-START", self._citation_start)
        self.stateMachine.add_state("BOLD-START", self._bold_start)
        self.stateMachine.add_state("ITALIC-START", self._italic_start)
        self.stateMachine.add_state("CODE-START", self._code_start)
        self.stateMachine.add_state("QUOTES-START", self._quotes_start)
        self.stateMachine.add_state("INLINE-INSERT", self._inline_insert)
        self.stateMachine.add_state("CHARACTER-ENTITY", self._character_entity)
        self.stateMachine.set_start("PARA")
        self.patterns = {
            'escape': re.compile(r'\\', re.U),
            'escaped-chars': re.compile(r'[\\\(\{\}\[\]_\*,\.\*`"&]', re.U),
            'annotation': re.compile(
                r'(?<!\\)\{(?P<text>.*?)(?<!\\)\}(\(\s*(?P<type>\S*?\s*[^\\"\']?)(["\'](?P<specifically>.*?)["\'])??\s*(\((?P<namespace>\w+)\))?\s*(~(?P<language>[\w-]+))?\))?', re.U),
            'bold': re.compile(r'\*(?P<text>((?<=\\)\*|[^\*])*)(?<!\\)\*', re.U),
            'italic': re.compile(r'_(?P<text>((?<=\\)_|[^_])*)(?<!\\)_', re.U),
            'code': re.compile(r'`(?P<text>(``|[^`])*)`', re.U),
            'quotes': re.compile(r'"(?P<text>((?<=\\)"|[^"])*)(?<!\\)"', re.U),
            'inline-insert': re.compile(r'>\((?P<attributes>.*?)\)', re.U),
            'character-entity': re.compile(r'&(\#[0-9]+|#[xX][0-9a-fA-F]+|[\w]+);'),
            'citation': re.compile(r'(\[\s*\*(?P<id>\S+)(\s+(?P<id_extra>.+?))?\])|(\[\s*\#(?P<name_name>\S+)(\s+(?P<extra>.+?))?\])|(\[\s*(?P<citation>.*?)\])', re.U)
        }
Example #30
def makeconfig(infp, outfp, modules, with_ifdef=0):
	m1 = regex.compile('-- ADDMODULE MARKER 1 --')
	m2 = regex.compile('-- ADDMODULE MARKER 2 --')
	while 1:
		line = infp.readline()
		if not line: break
		outfp.write(line)
		if m1 and m1.search(line) >= 0:
			m1 = None
			for mod in modules:
				if mod in never:
					continue
				if with_ifdef:
					outfp.write("#ifndef init%s\n"%mod)
				outfp.write('extern void init%s();\n' % mod)
				if with_ifdef:
					outfp.write("#endif\n")
		elif m2 and m2.search(line) >= 0:
			m2 = None
			for mod in modules:
				if mod in never:
					continue
				outfp.write('\t{"%s", init%s},\n' %
					    (mod, mod))
	if m1:
		sys.stderr.write('MARKER 1 never found\n')
	elif m2:
		sys.stderr.write('MARKER 2 never found\n')
Example #31
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(
            r"""([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"""
        )
Example #32
class CommonDefinitionPatterns:
    reg_semicolon = re.compile("([\"'“„])(?:(?=(\\\\?))\\2.)*?\\1(?=:)",
                               re.UNICODE | re.IGNORECASE)
    reg_quoted = re.compile("([\"'“„])(?:(?=(\\\\?))\\2.)*?\\1",
                            re.UNICODE | re.IGNORECASE)
    reg_acronyms = re.compile(r"\(\p{Lu}\p{L}*\p{Lu}\)", re.UNICODE)

    @staticmethod
    def match_acronyms(phrase: str) -> List[PatternFound]:
        """
        :param phrase: rompió el silencio tras ser despedido del Canal del Fútbol (CDF).
        :return: {name: 'CDF', probability: 100, ...}
        """
        defs = []
        for match in CommonDefinitionPatterns.reg_acronyms.finditer(phrase):
            acr_start = CommonDefinitionPatterns.get_acronym_words_start(
                phrase, match)
            if acr_start < 0:
                continue
            df = PatternFound()
            df.name = match.group().strip('() ')
            df.start = acr_start
            df.end = match.start() - 1
            df.probability = 100
            defs.append(df)
        return defs

    @staticmethod
    def get_acronym_words_start(phrase: str, match: Match) -> int:
        """
        each acronym match should be preceded by capitalized words that start from the same letters
        :param phrase: "rompió el silencio tras ser despedido del Canal del Fútbol (CDF). "
        :param match: "(CDF)" Match object for this example
        :return: start letter (42 for this case) index or -1
        """
        proc = UniversalDefinitionsParser.basic_line_processor
        name = match.group().strip('() ').upper()
        start = match.start()
        words = proc.split_text_on_words(phrase[:start])
        if len(words) < 2:
            return -1

        mistakes = 0
        uppercases = 0
        acr_index = len(name) - 1
        acr_start = words[-1].start

        for i in range(len(words) - 1, -1, -1):
            if words[i].is_separator:
                continue
            l = words[i].text[0]
            l_upper = l.upper()
            is_upper = l_upper == l
            if is_upper:
                uppercases += 1
            is_correct = name[acr_index] == l_upper
            if not is_correct:
                mistakes += 1
                if mistakes > 1:
                    return -1
                continue
            acr_start = words[i].start
            acr_index -= 1
            if acr_index < 0:
                break
        return acr_start if uppercases > 1 and acr_index < 0 else -1

    @staticmethod
    def match_es_def_by_semicolon(phrase: str) -> List[PatternFound]:
        """
        :param phrase: "Modern anatomy human": a human of modern anatomy.
        :return: {name: 'Modern anatomy human', probability: 100, ...}
        """
        prob = 100
        defs = []

        for match in CommonDefinitionPatterns.reg_semicolon.finditer(phrase):
            df = PatternFound()
            df.name = match.group()
            df.start = 0
            df.end = len(phrase)
            df.probability = prob
            defs.append(df)
            prob = 66

        return defs

    @staticmethod
    def peek_quoted_part(phrase: str, match: Match,
                         start_func: Callable[[str, Match, Match], int],
                         end_func: Callable[[str, Match, Match], int],
                         match_prob: int) -> List[PatternFound]:
        """
        :param phrase: the whole text, may be used for getting the definition's text length
        :param match: the matched part of the phrase that may contain several quote-packed definitions
        :param start_func: (phrase, match, quoted_match) -> definition's start
        :param end_func: (phrase, match, quoted_match) -> definition's end
        :param match_prob: definition's probability
        :return: a list of definitions found or an empty list
        """
        defs = []
        text = match.group()
        quoted_entries = [
            m for m in CommonDefinitionPatterns.reg_quoted.finditer(text)
        ]
        if len(quoted_entries) == 0:
            return defs
        for entry in quoted_entries:
            df = PatternFound()
            df.name = entry.group()
            df.start = start_func(phrase, match, entry)
            df.end = end_func(phrase, match, entry)
            df.probability = match_prob
            defs.append(df)
        return defs

    @staticmethod
    def collect_regex_matches_with_quoted_chunks(
            phrase: str, reg: re, prob: int,
            quoted_def_start: Callable[[str, Match, Match], int],
            quoted_def_end: Callable[[str, Match, Match], int],
            def_start: Callable[[str, Match], int],
            def_end: Callable[[str, Match], int]) -> List[PatternFound]:
        """
        First, find all matches of the 'reg' pattern.
        Second, go through the matches: for each one try to find a set of
        quoted words; if found, use them as matches, otherwise use the whole match.
        :param quoted_def_start: (phrase, match, quoted_match) -> definition's start
        :param quoted_def_end: (phrase, match, quoted_match) -> definition's end
        :param def_start: (phrase, match) -> definition's start
        :param def_end: (phrase, match) -> definition's end
        :return:
        """
        defs = []
        for match in reg.finditer(phrase):
            quoted_matches = \
                CommonDefinitionPatterns.peek_quoted_part(phrase,
                                                          match,
                                                          quoted_def_start,
                                                          quoted_def_end,
                                                          prob)
            if len(quoted_matches) > 0:
                defs += quoted_matches
                continue

            df = PatternFound()
            df.name = match.group()
            df.start = def_start(phrase, match)
            df.end = def_end(phrase, match)
            df.probability = prob
            defs.append(df)

        return defs

    @staticmethod
    def collect_regex_matches(
            phrase: str, reg: re, prob: int,
            def_start: Callable[[str, Match], int],
            def_end: Callable[[str, Match], int]) -> List[PatternFound]:
        """
        find all matches by 'reg' ptr
        :param quoted_def_start: (phrase, match, quoted_match) -> definition's start
        :param quoted_def_end: (phrase, match, quoted_match) -> definition's end
        :param def_start: (phrase, match) -> definition's start
        :param def_end: (phrase, match) -> definition's end
        :return:
        """
        defs = []
        for match in reg.finditer(phrase):

            df = PatternFound()
            df.name = match.group()
            df.start = def_start(phrase, match)
            df.end = def_end(phrase, match)
            df.probability = prob
            defs.append(df)

        return defs
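A usage sketch for the acronym matcher, reusing the example from its docstring (PatternFound and the word splitter come from the surrounding library):

defs = CommonDefinitionPatterns.match_acronyms(
    "rompió el silencio tras ser despedido del Canal del Fútbol (CDF).")
# defs[0].name == 'CDF', defs[0].probability == 100, and defs[0].start points at 'Canal'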
Example #33
# -*- coding: utf-8 -*-
"""
Created on Mon May 20 16:41:00 2019

@author: richard.mitanchey
"""

from lxml import etree
import regex as re
import unicodedata
import io
import networkx as nx
from collections import OrderedDict
import json

many = re.compile(r'\.\.\*')

parser = etree.XMLParser()

nsmap = {
    "uml":
    "http://www.omg.org/spec/UML/20110701",
    "xmi":
    "http://www.omg.org/spec/XMI/20110701",
    "thecustomprofile":
    "http://www.sparxsystems.com/profiles/thecustomprofile/1.0",
    "UML_Profile_for_INSPIRE_data_specifications":
    "http://www.sparxsystems.com/profiles/UML_Profile_for_INSPIRE_data_specifications/3.0-2"
}

Example #34
# -*- coding:utf-8 -*-
import random, os
import regex as re
from unidecode import unidecode


_punct_re = re.compile(r'[\t !":\!#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')

def create_chain(file_paths):
	markov_chain = {}
	word1 = "\n"
	word2 = "\n"
	for path in file_paths:
		with open(path) as file:
			for line in file:
				line = line.strip()
				for current_word in line.split():
					if current_word != "":
						markov_chain.setdefault((word1, word2), []).append(current_word)
						word1 = word2
						word2 = current_word
	return markov_chain

def construct_sentence(markov_chain, word_count=5, slug=False):
	generated_sentence = ""
	word_tuple = random.choice(list(markov_chain.keys()))
	w1 = word_tuple[0]	
	w2 = word_tuple[1]
	
	for i in range(word_count):
		newword = random.choice(markov_chain[(w1, w2)])
Example #35
def compile_infix_regex(entries):
    expression = '|'.join([piece for piece in entries if piece.strip()])
    return re.compile(expression)
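A small sketch of the helper above: empty and whitespace-only pieces are dropped before the rest are joined into a single alternation.

pattern = compile_infix_regex([r'\.\.\.', r'(?<=[0-9])[+\-\*^](?=[0-9])', '   '])
assert pattern.search("1+2") is not None      # operator piece
assert pattern.search("wait...") is not None  # ellipsis piece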
Example #36
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(r"""\b\d+\b""")
Example #37
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(r"""[^!"&':;?,\.\w\d ]+""")
Example #38
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(r"\S*\d+\S*", re.IGNORECASE)
Example #39
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(r"[!¡\"&':;¿?,\.]+")
Example #40
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(
         r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""",
         re.IGNORECASE)
Example #41
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(
            r"""(\s*(?P<punctuation>[!¡\"&':;¿?,\.]+)){2,}""")
        self.sub = lambda x: x.group("punctuation").strip()[0]
Example #42
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(r"[^(\p{L}|\ )]+")
Example #43
import regex as re
import numpy as np

INIT_ATOMIC_COORDINATES_RE = re.compile(
    r"""
    \sMODULE\sQUICKSTEP:\s\sATOMIC\sCOORDINATES\sIN\sangstrom\s*\n
    \n
    \s+Atom\s+Kind\s+Element\s+X\s+Y\s+Z\s+Z\(eff\)\s+Mass\s*\n
    (\n)?
    (
        \s+(?P<atom>\d+)
        \s+(?P<kind>\d+)
        \s+(?P<element>\w+)
        \s+\d+
        \s+(?P<x>[\s-]\d+\.\d+)
        \s+(?P<y>[\s-]\d+\.\d+)
        \s+(?P<z>[\s-]\d+\.\d+)
        \s+[\s-]\d+\.\d+
        \s+[\s-]\d+\.\d+
        \n
    )+
    """, re.VERBOSE)


def parse_init_atomic_coordinates(output_file):

    for match in INIT_ATOMIC_COORDINATES_RE.finditer(output_file):
        #print(match)
        # only get the last match
        init_atomic_coordinates = []
        chemical_symbols = []
Example #44
    def __init__(self):
        super().__init__()
        self.re_match = re.compile(r"""[\",\\0]""")
Example #45
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import regex as re
from datetime import datetime
from datetime import time

from dateutil.relativedelta import relativedelta
from dateutil.parser import parse

from dateparser.utils import is_dateutil_result_obj_parsed, apply_timezone

_UNITS = r'year|month|week|day|hour|minute|second'
PATTERN = re.compile(r'(\d+)\s*(%s)\b' % _UNITS, re.I | re.S | re.U)


class FreshnessDateDataParser(object):
    """ Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """
    def __init__(self):
        self._now = None

    @property
    def now(self):
        return self._now if self._now else datetime.utcnow()

    @now.setter
    def now(self, value):
        self._now = value

    def _are_all_words_units(self, date_string):
        skip = [_UNITS, r'ago|in|\d+', r':|[ap]m']
Example #46
#! /usr/bin/env python
# Update a bunch of files according to a script.
# The input file contains lines of the form <filename>:<lineno>:<text>,
# meaning that the given line of the given file is to be replaced
# by the given text.  This is useful for performing global substitutions
# on grep output:
import os
import sys
import regex
pat = '^\([^: \t\n]+\):\([1-9][0-9]*\):'
prog = regex.compile(pat)

class FileObj:
    def __init__(self, filename):
        self.filename = filename
        self.changed = 0
        try:
            self.lines = open(filename, 'r').readlines()
        except IOError, msg:
            print '*** Can\'t open "%s":' % filename, msg
            self.lines = None
            return
        print 'diffing', self.filename
    def finish(self):
        if not self.changed:
            print 'no changes to', self.filename
            return
        try:
            os.rename(self.filename, self.filename + '~')
            fp = open(self.filename, 'w')
Example #47
``b32decode`` function.

"""
from typing import Any
import binascii
import base64
import magic
import regex as re

from katana.unit import Unit as BaseUnit
from katana.unit import NotApplicable
from katana.util import is_good_magic
import katana.util

BASE32_PATTERN = rb"[A-Z2-7+/]+={0,6}"
BASE32_REGEX = re.compile(BASE32_PATTERN,
                          re.MULTILINE | re.DOTALL | re.IGNORECASE)


class Unit(BaseUnit):

    GROUPS = ["raw", "decode", "base32"]
    """
    These are "tags" for a unit. Considering it is a Raw unit, "raw"
    is included, as well as the tag "decode", and the unit name "base32".
    """

    PRIORITY = 60
    """
    Priority works with 0 being the highest priority, and 100 being the 
    lowest priority. 50 is the default priority. This unit has a low
    priority.
Example #48
def callback(ed):
    import regex
    ed.find_text(regex.compile('A bold word'))
Example #49
def validateXbrlFinally(val, *args, **kwargs):
    if not (val.validateHMRCplugin) or not val.txmyType:
        return

    modelXbrl = val.modelXbrl
    modelDocument = modelXbrl.modelDocument

    _statusMsg = _("validating {0} filing rules").format(
        val.disclosureSystem.name)
    modelXbrl.profileActivity()
    modelXbrl.modelManager.showStatus(_statusMsg)

    if modelDocument.type in (ModelDocument.Type.INSTANCE,
                              ModelDocument.Type.INLINEXBRL):
        labelHasNegativeTermPattern = re.compile(r".*[(].*\w.*[)].*")

        companyReferenceNumberContexts = defaultdict(list)
        for c1 in modelXbrl.contexts.values():
            scheme, identifier = c1.entityIdentifier
            if scheme == "http://www.companieshouse.gov.uk/":
                companyReferenceNumberContexts[identifier].append(c1.id)

        uniqueFacts = {}  # key = (qname, context hash, unit hash, lang)
        mandatoryFacts = {}
        mandatoryGDV = defaultdict(set)
        factForConceptContextUnitLangHash = defaultdict(list)
        hasCompaniesHouseContext = any(
            cntx.entityIdentifier[0] == "http://www.companieshouse.gov.uk/"
            for cntx in val.modelXbrl.contexts.values())

        contextsUsed = set(f.context for f in modelXbrl.factsInInstance
                           if f.context is not None)

        for cntx in contextsUsed:
            for dim in cntx.qnameDims.values():
                if dim.isExplicit:
                    _memName = dim.memberQname.localName
                    m = memNameNumPattern.match(_memName)
                    if m:
                        l = m.group(1)
                        n = int(m.group(2))
                    else:
                        l = _memName
                        n = None
                    for _gdvType in (val.txmyType, "business"):
                        gdv = genericDimensionValidation.get(
                            _gdvType, EMPTYDICT).get(l)
                        if gdv:  # take first match
                            break
                    if (gdv and
                        (n is None or
                         (isinstance(gdv[0], int) and isinstance(gdv[1], int)
                          and n >= gdv[0] and n <= gdv[1]))):
                        gdvFacts = [f for f in gdv if isinstance(f, str)]
                        if len(gdvFacts) == 1:
                            mandatoryGDV[gdvFacts[0]].add(
                                GDV(gdvFacts[0], None, _memName))
                        elif len(gdvFacts) == 2:
                            mandatoryGDV[gdvFacts[0]].add(
                                GDV(gdvFacts[0], gdvFacts[1], _memName))
                            mandatoryGDV[gdvFacts[1]].add(
                                GDV(gdvFacts[1], gdvFacts[0], _memName))

        def checkFacts(facts):
            for f in facts:
                cntx = f.context
                unit = f.unit
                if getattr(
                        f, "xValid",
                        0) >= 4 and cntx is not None and f.concept is not None:
                    factNamespaceURI = f.qname.namespaceURI
                    factLocalName = f.qname.localName
                    if factLocalName in mandatoryItems[val.txmyType]:
                        mandatoryFacts[factLocalName] = f
                    if factLocalName == "UKCompaniesHouseRegisteredNumber" and val.isAccounts:
                        if hasCompaniesHouseContext:
                            mandatoryFacts[factLocalName] = f
                        for _cntx in contextsUsed:
                            _scheme, _identifier = _cntx.entityIdentifier
                            if _scheme == "http://www.companieshouse.gov.uk/" and f.xValue != _identifier:
                                modelXbrl.error(
                                    "JFCVC.3316",
                                    _("Context entity identifier %(identifier)s does not match Company Reference Number (UKCompaniesHouseRegisteredNumber) Location: Accounts (context id %(id)s)"
                                      ),
                                    modelObject=(f, _cntx),
                                    identifier=_identifier,
                                    id=_cntx.id)
                    if not f.isNil:
                        factForConceptContextUnitLangHash[
                            f.conceptContextUnitLangHash].append(f)

                    if f.isNumeric:
                        if f.precision:
                            modelXbrl.error(
                                "HMRC.5.4",
                                _("Numeric fact %(fact)s of context %(contextID)s has a precision attribute '%(precision)s'"
                                  ),
                                modelObject=f,
                                fact=f.qname,
                                contextID=f.contextID,
                                precision=f.precision)
                        try:  # only process validated facts
                            if f.xValue < 0:
                                label = f.concept.label(lang="en")
                                if not labelHasNegativeTermPattern.match(
                                        label):
                                    modelXbrl.error(
                                        "HMRC.5.3",
                                        _("Numeric fact %(fact)s of context %(contextID)s has a negative value '%(value)s' but label does not have a bracketed negative term (using parentheses): %(label)s"
                                          ),
                                        modelObject=f,
                                        fact=f.qname,
                                        contextID=f.contextID,
                                        value=f.value,
                                        label=label)
                        except AttributeError:
                            pass  # if not validated it should have failed with a schema error

                    # check GDV
                    if f.qname.localName in mandatoryGDV:
                        _gdvReqList = mandatoryGDV[factLocalName]
                        _gdvReqRemovals = []
                        for _gdvReq in _gdvReqList:
                            if any(_gdvReq.memLocalName ==
                                   dim.memberQname.localName
                                   for dim in cntx.qnameDims.values()
                                   if dim.isExplicit):
                                _gdvReqRemovals.append(_gdvReq)
                                if _gdvReq.altFact in mandatoryGDV:
                                    _gdvAltList = mandatoryGDV[_gdvReq.altFact]
                                    _gdvAltRemovals = []
                                    for _gdvAlt in _gdvAltList:
                                        if any(_gdvAlt.memLocalName ==
                                               dim.memberQname.localName for
                                               dim in cntx.qnameDims.values()
                                               if dim.isExplicit):
                                            _gdvAltRemovals.append(_gdvAlt)
                                    for _gdvAlt in _gdvAltRemovals:
                                        _gdvAltList.remove(_gdvAlt)
                        if _gdvReqRemovals and not f.xValue:  # fact was a mandatory name or description
                            modelXbrl.error(
                                "JFCVC.3315",
                                _("Generic dimension members associated name/description has no text: %(fact)s"
                                  ),
                                modelObject=f,
                                fact=f.qname)
                        for _gdvReq in _gdvReqRemovals:
                            _gdvReqList.remove(_gdvReq)

                    if f.modelTupleFacts:
                        checkFacts(f.modelTupleFacts)

        checkFacts(modelXbrl.facts)

        if val.isAccounts:
            _missingItems = mandatoryItems[
                val.txmyType] - mandatoryFacts.keys()
            if hasCompaniesHouseContext and "UKCompaniesHouseRegisteredNumber" not in mandatoryFacts:
                _missingItems.add("UKCompaniesHouseRegisteredNumber")
            if _missingItems:
                modelXbrl.error("JFCVC.3312",
                                _("Mandatory facts missing: %(missingItems)s"),
                                modelObject=modelXbrl,
                                missingItems=", ".join(_missingItems))

            f = mandatoryFacts.get("StartDateForPeriodCoveredByReport")
            if f is not None and f.xValue < _6_APR_2008:
                modelXbrl.error(
                    "JFCVC.3313",
                    _("Period Start Date (StartDateForPeriodCoveredByReport) must be 6 April 2008 or later, but is %(value)s"
                      ),
                    modelObject=f,
                    value=f.value)

            memLocalNamesMissing = set(
                "{}({})".format(_gdvRec.memLocalName, _gdvRec.factNames)
                for _gdv in mandatoryGDV.values() for _gdvRec in _gdv)
            if memLocalNamesMissing:
                modelXbrl.error(
                    "JFCVC.3315",
                    _("Generic dimension members have no associated name or description item, member names (name or description item): %(memberNames)s"
                      ),
                    modelObject=modelXbrl,
                    memberNames=", ".join(sorted(memLocalNamesMissing)))

        aspectEqualFacts = defaultdict(list)
        for hashEquivalentFacts in factForConceptContextUnitLangHash.values():
            if len(hashEquivalentFacts) > 1:
                for f in hashEquivalentFacts:
                    aspectEqualFacts[(f.qname, f.contextID, f.unitID,
                                      f.xmlLang)].append(f)
                for fList in aspectEqualFacts.values():
                    f0 = fList[0]
                    if any(not f.isVEqualTo(f0) for f in fList[1:]):
                        modelXbrl.error(
                            "JFCVC.3314",
                            "Inconsistent duplicate fact values %(fact)s: %(values)s.",
                            modelObject=fList,
                            fact=f0.qname,
                            contextID=f0.contextID,
                            values=", ".join(f.value for f in fList))
                aspectEqualFacts.clear()
        del factForConceptContextUnitLangHash, aspectEqualFacts

    modelXbrl.profileActivity(_statusMsg, minTimeToShow=0.0)
    modelXbrl.modelManager.showStatus(None)
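The JFCVC.3314 block above groups facts by a concept/context/unit/language hash, then by the exact aspect tuple, and flags groups whose values are not v-equal. A standalone sketch of the same pattern, collapsing the hash pre-bucketing into a direct aspect-key grouping; the tuples and data stand in for Arelle facts and are invented:

from collections import defaultdict

# Stand-ins for Arelle facts: (qname, contextID, unitID, xmlLang, value).
facts = [
    ('uk:Turnover', 'c1', 'u1', 'en', '100'),
    ('uk:Turnover', 'c1', 'u1', 'en', '120'),  # inconsistent duplicate
    ('uk:Profit',   'c1', 'u1', 'en', '10'),
    ('uk:Profit',   'c1', 'u1', 'en', '10'),   # consistent duplicate, no error
]

byAspects = defaultdict(list)
for f in facts:
    byAspects[f[:4]].append(f)  # aspect key: qname/context/unit/language

for key, fList in byAspects.items():
    f0 = fList[0]
    # Plain value comparison stands in for ModelFact.isVEqualTo.
    if any(f[4] != f0[4] for f in fList[1:]):
        print('JFCVC.3314 inconsistent duplicates for', key[0],
              ':', ', '.join(f[4] for f in fList))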
Beispiel #50
0
import json
import math
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import pydash as _
import regex as re
import torch
import ujson
import yaml
import pprint

NUM_CPUS = mp.cpu_count()
DF_FILE_EXT = ['.csv', '.xlsx', '.xls']
FILE_TS_FORMAT = '%Y_%m_%d_%H%M%S'
RE_FILE_TS = re.compile(r'(\d{4}_\d{2}_\d{2}_\d{6})')
# Matches a leading newline, runs of 2+ whitespace not starting with a newline,
# or a trailing newline plus indent.
RE_INDENT = re.compile(r'(^\n)|(?!\n)\s{2,}|(\n\s+)$')
SPACE_PATH = ['agent', 'agent_space', 'aeb_space', 'env_space', 'env']


class LabJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return str(obj)
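Serialisation then goes through the standard cls= hook of json.dumps; a minimal usage sketch reusing the imports above (the data values are invented):

data = {'step': np.int64(3), 'loss': np.float32(0.25), 'obs': np.arange(3)}
print(json.dumps(data, cls=LabJsonEncoder))
# {"step": 3, "loss": 0.25, "obs": [0, 1, 2]}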
Beispiel #51
0
def _generate_as_number_regex(self, as_numbers):
    """Generate regex for finding AS numbers."""
    # Match a non-digit (or start of string), any of the AS numbers, and a
    # trailing non-digit (or end). \K resets the match start and the (?=...)
    # lookahead matches without consuming, so the surrounding context is
    # required by the match but excluded from it.
    self.as_num_regex = regex.compile(r'(\D|^)\K(' + '|'.join(as_numbers) +
                                      r')(?=\D|$)')
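A hedged usage sketch of the same pattern outside the class (the AS numbers and config line are invented):

import regex

as_numbers = ['64512', '65001']
as_num_regex = regex.compile(r'(\D|^)\K(' + '|'.join(as_numbers) + r')(?=\D|$)')

for m in as_num_regex.finditer('router bgp 64512 neighbor 65001;'):
    print(m.group(0))  # prints 64512, then 65001: context is not included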
Beispiel #52
0
# -*- coding: utf-8 -*-
# @Date     : 2020-03-23
# @Author   : zhaoguocai
# @QQ       : 30516864
# @QQ discussion group: 685096991

import time
import regex

# Recursive pattern: $name( ... ) with balanced, arbitrarily nested parentheses.
# (?>...) is an atomic group (no backtracking into it); (?R) recurses into the
# whole pattern, which is what lets it match nested calls.
keyword_regex = regex.compile(r'(\$\w+\((?>[^()]+|(?R))*\))')

s = '$a1($a2($a3($a99("dsd")),a4,$a5(1,2),a6),a7,$a8($a9()))'

start_time = time.time()
keyword_list = regex.findall(keyword_regex, s)
end_time = time.time()

print(keyword_list)
print(end_time - start_time)
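Note that findall returns only the outermost balanced call here, because the scanner consumes each whole match before moving on. A sketch (the walk_calls helper is invented) that also collects the nested calls by recursing into each match's interior:

import regex

CALL = regex.compile(r'(\$\w+\((?>[^()]+|(?R))*\))')

def walk_calls(text):
    """Yield every $name(...) call: each outermost match, then its nested calls."""
    for m in CALL.finditer(text):
        call = m.group(1)
        yield call
        # Drop '$name(' and the closing ')', then scan the interior recursively.
        yield from walk_calls(call[call.index('(') + 1:-1])

s2 = '$a1($a2($a3("x")),$a4())'
print(list(walk_calls(s2)))
# ['$a1($a2($a3("x")),$a4())', '$a2($a3("x"))', '$a3("x")', '$a4()']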
Beispiel #53
0
def _generate_sensitive_word_regex(cls, sensitive_words):
    """Compile and return regex for the specified list of sensitive words."""
    return regex.compile('({})'.format('|'.join(sensitive_words)),
                         regex.IGNORECASE)
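One caveat: the words are joined into the alternation unescaped, so any regex metacharacters in them stay live. If the inputs are meant as literal strings, escaping them first is safer; a hedged sketch (the word list is invented):

import regex

words = ['password', 'secret-key', 'community (ro)']
# regex.escape neutralises metacharacters such as the parentheses above.
pattern = regex.compile('({})'.format('|'.join(map(regex.escape, words))),
                        regex.IGNORECASE)
print(bool(pattern.search('set Community (RO) here')))  # True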
Beispiel #54
0
'''
(c) Copyright 2013 Mark V Systems Limited, All rights reserved.

References:
  https://xbrl.frc.org.uk (taxonomies, filing requirements, consistency checks)
  https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/434597/joint-filing-validation-checks.pdf
'''
import os
from arelle import ModelDocument, XmlUtil
from arelle.ModelValue import qname, dateTime, DATE
try:
    import regex as re
except ImportError:
    import re
from collections import defaultdict

memNameNumPattern = re.compile(r"^([A-Za-z-]+)([0-9]+)$")
compTxmyNamespacePattern = re.compile(
    r"http://www.govtalk.gov.uk/uk/fr/tax/uk-hmrc-ct/[0-9-]{10}")
EMPTYDICT = {}
_6_APR_2008 = dateTime("2008-04-06", type=DATE)

commonMandatoryItems = {
    "EntityCurrentLegalOrRegisteredName", "StartDateForPeriodCoveredByReport",
    "EndDateForPeriodCoveredByReport", "BalanceSheetDate"
}
mandatoryItems = {
    "ukGAAP": commonMandatoryItems | {
        "DateApprovalAccounts", "NameDirectorSigningAccounts", "EntityDormant",
        "EntityTrading", "DateSigningDirectorsReport", "DirectorSigningReport"
    },
    "charities": commonMandatoryItems | {
Beispiel #55
0
def __init__(self, patterns: List[str]) -> None:
    self.patterns = patterns
    self.joined_patterns = re.compile("|".join(self.patterns))
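Joining with | leaves each sub-pattern's own alternations live at the top level, so wrapping each in a non-capturing group is safer. A self-contained sketch (the class name and matches helper are invented for illustration):

import re
from typing import List

class PatternUnion:
    def __init__(self, patterns: List[str]) -> None:
        self.patterns = patterns
        # (?:...) around each pattern keeps e.g. 'a|b' from splitting the union.
        self.joined_patterns = re.compile(
            "|".join("(?:{})".format(p) for p in self.patterns))

    def matches(self, text: str) -> bool:
        return self.joined_patterns.search(text) is not None

pu = PatternUnion([r"\bfoo\b", r"\d{4}"])
print(pu.matches("year 2024"))  # True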
Beispiel #56
0
def generate_default_sensitive_item_regexes():
    """Compile and return the default password and community line regexes."""
    combined_regexes = (default_pwd_line_regexes + default_com_line_regexes +
                        extra_password_regexes)
    return [[(regex.compile(_ALLOWED_REGEX_PREFIX + regex_), num)
             for regex_, num in group] for group in combined_regexes]
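The module-level name lists come from the surrounding file; a hedged sketch with invented stand-in data showing the shape of the return value (a list of groups of (compiled_regex, capture_group_number) pairs):

import regex

_ALLOWED_REGEX_PREFIX = r'(?:^|\s)'  # stand-in prefix, assumed for illustration
default_pwd_line_regexes = [[(r'password \S+', 1)]]
default_com_line_regexes = [[(r'community \S+', 1)]]
extra_password_regexes = [[(r'secret \S+', 1)]]

combined = (default_pwd_line_regexes + default_com_line_regexes +
            extra_password_regexes)
compiled = [[(regex.compile(_ALLOWED_REGEX_PREFIX + r_), num)
             for r_, num in group] for group in combined]

m = compiled[0][0][0].search('set password hunter2')
print(m.group(0))  # ' password hunter2' (the prefix consumed the space)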
Beispiel #57
0
	def testDependenciesAppearInTheSourceBeforeFilesThatRequiredThem(self):
		''' Test dependencies appear in the source before files that required them '''

		self.assertRegexpMatches(str(self.get_asset('application.js')),re.compile(r"""Project.+Users.+focus""",re.M|re.S))
Beispiel #58
0
    def start(self):
        self.setVisible(True)
        self.pi.start()
        self.is_running = True

    def stop(self):
        self.setVisible(False)
        self.pi.stop()
        self.is_running = False


# }}}

quote_map = {'"': '"“”', "'": "'‘’"}
qpat = regex.compile(r'''(['"])''')
spat = regex.compile(r'(\s+)')
invisible_chars = '(?:[\u00ad\u200c\u200d]{0,1})'
SEARCH_RESULT_ROLE = Qt.ItemDataRole.UserRole
RESULT_NUMBER_ROLE = SEARCH_RESULT_ROLE + 1
SPINE_IDX_ROLE = RESULT_NUMBER_ROLE + 1


def text_to_regex(text):
    has_leading = text.lstrip() != text
    has_trailing = text.rstrip() != text
    if text and not text.strip():
        return r'\s+'
    ans = []
    for wpart in spat.split(text.strip()):
        if not wpart.strip():
Beispiel #59
0
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}
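The group helper used throughout is defined earlier in the file (it has the same shape in CPython's tokenize and lib2to3 sources): it simply wraps alternatives in a capturing group. A minimal sketch, assuming that definition:

import re

def group(*choices):
    # Same shape as the helper in CPython's tokenize.py / lib2to3.
    return '(' + '|'.join(choices) + ')'

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
print([m.group(0) for m in re.finditer(group(Bracket, Special), 'a, b;\n(c)')])
# [',', ';', '\n', '(', ')']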
Beispiel #60
0
def _compile_regexes(self):
    for key, value in self.patterns.items():
        # Replacing values for existing keys during iteration is safe;
        # only adding or removing keys would raise a RuntimeError.
        self.patterns[key] = regex.compile(value)
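A self-contained sketch of how a class might use this method; the Scrubber wrapper and its pattern are invented:

import regex

class Scrubber:
    """Minimal sketch: patterns start as strings, compiled in place once."""
    def __init__(self, patterns):
        self.patterns = dict(patterns)
        self._compile_regexes()

    def _compile_regexes(self):
        for key, value in self.patterns.items():
            self.patterns[key] = regex.compile(value)

s = Scrubber({'ipv4': r'\d{1,3}(\.\d{1,3}){3}'})
print(s.patterns['ipv4'].search('host 10.0.0.1').group(0))  # 10.0.0.1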