def is_message_id(self, match: re.match, msg: str) -> bool:
    """Return True when the matched "email" is really a Postfix message ID.

    The email regex also matches things like
    "message-id=Issue1649523226559@postfix-mail.mail-system.svc.cluster.local",
    so before masking we scan backwards from the match looking for the
    "message-id=" marker, skipping over common wrapper characters.
    """
    candidate = match.group()
    if candidate.startswith(self.MESSAGE_ID_LINE):
        return True
    begin = match.start()
    # Not enough room before the match for the marker to fit.
    if begin < self.MESSAGE_ID_LINE_LEN:
        return False
    # Walk left over wrapper characters until something else shows up.
    cursor = begin - 1
    while msg[cursor] in '{<["\'':
        cursor -= 1
    if msg[cursor] != '=':
        return False
    marker = msg[cursor - self.MESSAGE_ID_LINE_LEN + 1:cursor + 1]
    return marker == self.MESSAGE_ID_LINE
def helper(match: re.match):
    """Insert an explicit ``as`` in front of a captured column alias.

    When the captured "alias" is actually a Presto reserved keyword
    (including ASC/DESC), it is not an alias at all but the continuation
    of the SQL logic, so the match is returned untouched.
    """
    alias = match.groupdict()["alias"]
    keyword = alias.upper()
    if keyword in self.reserved_keywords or keyword in ("ASC", "DESC"):
        # Not an alias but the continuation of the SQL logic.
        return match.group()
    return match.group()[:-len(alias)] + "as " + alias
def match_to_free(line: int, m: re.Match) -> Free:
    """Build a ``Free`` record from a parsed trace-line match.

    Given a match whose groups look like
    ('77633', '__init__.py', '422', 'validate', '0x7fca858e4200')
    this returns a Free() object.  (The original docstring wrongly
    claimed a Malloc() was returned.)

    Args:
        line: Line number in the trace the match came from.
        m: Match with 5 groups — presumably (pid, filename, lineno,
           funcname, address); confirm against the calling regex.

    Returns:
        Free: numeric fields parsed as int, address parsed base-16.
    """
    return Free(
        line,
        int(m.group(1)),      # e.g. '77633' — numeric id (pid?)
        m.group(2),           # source file name
        int(m.group(3)),      # source line number
        m.group(4),           # function name
        int(m.group(5), 16),  # hex address string -> int
    )
def match_to_malloc(line: int, m: re.match) -> Malloc:
    """Build a ``Malloc`` record from a parsed trace-line match.

    The match is expected to carry groups like
    ('77633', 'cmn_cmd_opts.py', '141', 'set_log_level', '560',
     '0x7fca83ef4240').
    """
    numeric_id = int(m.group(1))     # e.g. '77633'
    filename = m.group(2)            # source file name
    lineno = int(m.group(3))         # source line number
    funcname = m.group(4)            # function name
    size = int(m.group(5))           # allocation size
    address = int(m.group(6), 16)    # hex address string -> int
    return Malloc(line, numeric_id, filename, lineno, funcname, size,
                  address)
def _makeRange(m: re.match, s: int, e: int, i: int) -> List[float]: # For a pythonesque ranging: #list(arange(float(m.group(s)),float(m.group(e)),float(m.group(i)))) # BUT probably more expected ranging includes end point... # Check if there is no interval and ensure minimal return is start tol = 1e-3 # Attempt to allow for just missing the end sign = lambda x: copysign(1, x) (vs, ve, vi) = (float(m.group(s)), float(m.group(e)), float(m.group(i))) # Tolerate the interval being the wrong way round for the increment if vi == 0.0 or sign(vi) * (ve - vs) <= 0: # arange will now be ok return [vs] return list(arange(vs, ve + (ve - vs) * tol, vi))
def literal_string_decode(match: re.match) -> str:
    """Decode an encoded string literal; usable as a re.sub callback.

    Expects group 1 to be the quote kind and group 2 the encoded
    contents.  The contents are url-decoded, quotes escaped and escape
    sequences activated.  The original quote kind is kept unless the
    decoded text contains a newline, in which case the string is
    promoted to triple quotes.
    """
    decoded = url_decode(match.group(2))
    body = special_activate(quote_escape(decoded))
    quote = "'''" if "\n" in body else match.group(1)
    return quote + body + quote
def get_context(self, document: str, match_found: re.match, context_config: dict):
    """
    This method gets the context around a found match in the document in
    accordance with the context configuration

    :param document: The document to be scanned
    :param match_found: The match that is to be used as the center of the
        context window
    :param context_config: The context configuration; reads keys 'type'
        (a ContextType) and 'size' (window size in words / paragraphs)
    :return: A string containing the context around the found match
        (Can parameterize later to return str or list!)
        NOTE(review): the WORD branch actually returns a *tuple*
        (match_str, context) while the PARAGRAPH branch returns just the
        joined string — confirm which shape callers expect.
    """
    match_str = document[match_found.start():match_found.end()].strip()
    # Everything before / after the matched span.
    preceding_text = document[:match_found.start()]
    succeeding_text = document[match_found.end():]
    if context_config['type'] == ContextType.WORD:
        # Word context: take 'size' whitespace-separated words on each side.
        preceding_text_words = self.trim_boundaries(
            re.split(r'\s+', preceding_text))
        succeeding_text_words = self.trim_boundaries(
            re.split(r'\s+', succeeding_text))
        return match_str, ' '.join(
            preceding_text_words[len(preceding_text_words) -
                                 context_config['size']:] + ['TARGETWORD'] +
            succeeding_text_words[:context_config['size']])
    if context_config['type'] == ContextType.PARAGRAPH:
        # Paragraph context: paragraphs are delimited by empty lines.
        preceding_text_lines = self.trim_boundaries(
            re.split(self.split_lines_regex, preceding_text))
        succeeding_text_lines = self.trim_boundaries(
            re.split(self.split_lines_regex, succeeding_text))
        # Indices of blank lines on each side; these mark paragraph breaks.
        preceding_text_empty_line_indices = [
            index for index, item in enumerate(preceding_text_lines)
            if len(item.strip()) < 1
        ]
        succeeding_text_empty_line_indices = [
            index for index, item in enumerate(succeeding_text_lines)
            if len(item.strip()) < 1
        ]
        # No blank line at all: pretend there is one just outside the text
        # so the slicing below keeps everything on that side.
        if not preceding_text_empty_line_indices:
            preceding_text_empty_line_indices = [-1]
        if not succeeding_text_empty_line_indices:
            succeeding_text_empty_line_indices = [
                len(succeeding_text_lines)
            ]
        # Keep everything after the 'size'-th blank line from the end on the
        # preceding side, and everything before the 'size'-th blank line on
        # the succeeding side; fall back to the outermost break (-1) when
        # fewer breaks than 'size' exist.
        return ' '.join(
            preceding_text_lines[preceding_text_empty_line_indices[
                len(preceding_text_empty_line_indices) -
                context_config['size'] if context_config['size'] < len(
                    preceding_text_empty_line_indices) else -1] + 1:] +
            ['TARGETWORD'] +
            succeeding_text_lines[:succeeding_text_empty_line_indices[
                context_config['size'] - 1 if context_config['size'] < len(
                    succeeding_text_empty_line_indices) else -1]])
def replace(self, match: re.match, msg: str) -> str:
    """Mask a matched email address with salted HMAC-SHA256 digests.

    Postfix message IDs that merely look like email addresses are passed
    through unchanged.  Depending on configuration the address is
    lowercased first, hashed as a whole or split into local/domain
    halves, and each digest optionally shortened to 8 hex characters.
    The result is wrapped in the configured prefix/suffix.
    """
    email = match.group()
    # Leave anything that looks like a Postfix message ID untouched.
    if self.is_message_id(match, msg):
        return email
    if not self.case_sensitive:
        email = email.lower()

    def digest(text: str) -> str:
        # Salted keyed hash; optionally truncated for readability.
        out = hmac.new(self.salt_encoded, text.encode(),
                       hashlib.sha256).hexdigest()
        return out[:8] if self.short_sha else out

    if self.split:
        # The "@" can show up in the local part, but shouldn't appear in
        # the domain part, hence the right-most split.
        local, domain = email.rsplit("@", 1)
        masked = digest(local) + "@" + digest(domain)
    else:
        masked = digest(email)
    return self.prefix + masked + self.suffix
def _cast_in_subquery(self, sql: str, result: re.match, **kwargs) -> str:
    """Value & subqueries results must have the same data type.

    Example:
        select 1 in (select '1')
        --> line 1:10: value and result of subquery must be of the same
            type for IN expression: integer vs varchar (1)
        Fix: select cast(1 AS varchar) in (select '1')

    Args:
        sql (str): SQL to fix
        result (re.match): Regex match object

    Returns:
        str: Fixed SQL
    """
    # Presto reports 1-based positions; convert to python array notation.
    line = int(result["line"]) - 1
    column = int(result["column"]) - 1
    # Should return an identifier list or identifier.
    token, idx = self.ColumnCaster.get_problematic_token(sql, line, column)
    logging.debug(
        f"[DEBUG] Found token {[token]}\nvalue:{token}|ttype:{token.ttype}\nparent:{token.parent}"
    )
    return self.ColumnCaster.cast_non_trivial_tokens(
        sql, token, idx, result["f_type_0"], result.groupdict(),
        count_forward_tokens=0)
def _subHtmlTag(match: re.match) -> str: """ Determines whether to replace the tag with a space or to just remove it. """ startIndex, endIndex = match.span() return "" if ( startIndex == 0 or match.string[startIndex - 1].isspace() or endIndex == len(match.string) or match.string[endIndex].isspace() ) else " "
def _cast_in(self, sql: str, result: re.match, **kwargs) -> str:
    """Value & IN statement must have the same type of data type.

    Example:
        select a \nfrom cte\nwhere a in ( \n 1, 2, 3)
        --> line 4:2: IN value and list items must be the same type: bigint (1)
        Fix: select a \nfrom cte\nwhere cast(a AS bigint) in ( \n 1, 2, 3)

    Args:
        sql (str): SQL to fix
        result (re.match): Regex match object

    Raises:
        ValueError: Data type in IN statement are inconsistent. It is not
            clear which data type should be chosen.

    Returns:
        str: Fixed SQL
    """
    line, column = int(result["line"]) - 1, int(
        result["column"]) - 1  # Presto to python array notation
    token, idx = self.ColumnCaster.get_problematic_token(
        sql, line, column)  # Should return an identifier list or identifier
    # Collect the literal tokens of the IN list, skipping layout tokens.
    if isinstance(token, IdentifierList):
        token_in_list = [
            t for t in token.tokens
            if t.ttype not in (Whitespace, Punctuation, Comment, Newline)
        ]
    else:
        token_in_list = [token]
    # Every literal in the list must agree on one type for the cast target.
    if all(t.ttype == Literal.String.Single for t in token_in_list):
        cast_to = "varchar"
    elif all(t.ttype == Literal.Number.Integer for t in token_in_list):
        cast_to = "bigint"
    elif all(t.ttype == Literal.Number.Float for t in token_in_list):
        cast_to = "double"
    else:
        raise ValueError(
            f"Inconsistent data type in the IN list! {[token_in_list]}")
    # Rewind idx from the token's offset to an absolute position by
    # subtracting the length of everything that precedes it in the tree.
    parent_idx = token.parent.tokens.index(token)
    for preceding_token in token.parent.tokens[
            parent_idx - 1::-1]:  # Subtract the length of all preceding siblings
        idx -= len(preceding_token.value)
    grand_parent_idx = token.parent.parent.tokens.index(
        token.parent
    )  # Move up to the grand parent token. See tests to understand why this is relevant
    for preceding_token in token.parent.parent.tokens[grand_parent_idx -
                                                      1::-1]:
        idx -= len(preceding_token.value)
        # Stop once the IN keyword itself is reached — that keyword becomes
        # the anchor token for the cast.
        if preceding_token.ttype == Keyword and preceding_token.value.lower(
        ) == "in":
            token = preceding_token
            break
    return self.ColumnCaster.cast_non_trivial_tokens(
        sql, token, idx, cast_to, result.groupdict(), count_forward_tokens=0)
def escape_char(m: re.match) -> str:
    """Encode one matched character for a sanitized identifier.

    '/' becomes '--', '-' becomes '-m', anything else becomes '-'
    followed by its lower-case hex code point (no '0x' prefix).
    """
    char = m.group()
    if char == '/':
        return '--'
    if char == '-':
        return '-m'
    return '-' + format(ord(char), 'x')
def get_group_str(self, match: re.match = None, group_name: str = ''):
    """Return the text captured by *group_name*, or '' when unavailable.

    Fixes two gaps in the original: calling with the default
    ``match=None`` raised AttributeError (only IndexError was caught),
    and a named group that did not participate in the match leaked
    ``None`` from a function whose contract is a string.

    :param match: Regex match object, or None.
    :param group_name: Name of the group to extract.
    :return: The captured text, or '' when the match is None, the group
        name is unknown, or the group did not participate.
    """
    if match is None:
        return ''
    try:
        group_str = match.group(group_name)
    except IndexError:
        # Unknown group name (including the '' default).
        return ''
    return group_str if group_str is not None else ''
def from_gcov_match(cls, match: re.match):
    """Build a line-coverage record from a gcov source-line match.

    Groups are (coverage marker, line number, source text).  The marker
    '-' means non-executable (-1); '#####' or '=====' mean executable
    but never executed (0); anything else is the execution count.
    """
    coverage_str, linenum_str, source = match.groups()
    special_markers = {'-': -1, '#####': 0, '=====': 0}
    coverage = special_markers.get(coverage_str)
    if coverage is None:
        coverage = int(coverage_str)
    return cls(int(linenum_str), source, coverage)
def format(self, path: Path, match: re.match):
    """
    Format the path with the result of the matching.
    Only replace what was captured.
    """
    assert match is not None
    subject = match.string
    # Everything outside the captured span is preserved verbatim.
    before = subject[:match.start()]
    after = subject[match.end():]
    renamed = file_formatter.format(self.renamer, None, *match.groups(),
                                    **match.groupdict())
    return self.untouched_root(path) / Path(before + renamed + after)
def free_key_quote(match: re.match) -> str:
    '''
    Response to re.sub, returns a quoted free-key if \\2 is a valid
    python identifier. \\1 is assumed to be the indentation level.
    String key is encoded.

    Keys that are neither raw keys nor identifiers are now passed through
    unchanged: the original implementation fell off the end and returned
    None for them, which makes re.sub raise TypeError.

    >>> import re
    >>> re.sub(r"(\\t*)([^' \\n,\{]+)(?= *:)",free_key_quote,"\\ta:4")
    "\\t'YQ==':4"
    >>> re.sub(r"(\\t*)([^' \\n,\{]+)(?= *:)",free_key_quote,"\\t'a':4")
    "\\t'a':4"
    '''
    level = match.group(1)
    key = match.group(2)
    if is_raw_key(key):
        return level + key
    if key.isidentifier():
        return f"{level}'{url_encode(key)}'"
    # Fallback: leave unrecognised keys untouched instead of returning None.
    return level + key
def change(match: re.match):
    """Normalise a bare decimal point in a matched number literal.

    '.5' becomes '0.5' and '5.' becomes '5.0'; anything else is
    returned unchanged.
    """
    text = match.group()
    if text[0] == '.':
        # Leading dot: prepend the missing zero.
        return '0' + text
    if text[-1] == '.':
        # Trailing dot: append the missing zero.
        return text + '0'
    return text
def _wrap_date_match(order: str, match: re.match, pattern: str=None) -> dict or None: """ Args: order: enums['MDY', 'DMY', 'YMD'] - order of the date match: re.match - a regex match object pattern: str - if user defined the pattern, record it here Returns: """ return { 'value': match.group(), 'groups': match.groups(), 'start': match.start(), 'end': match.end(), 'order': order, 'pattern': pattern } if match else None
def lowercasesircumflexsub(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ᾷ', 'H': u'ῇ', 'W': u'ῷ', } substitute = substitutions[val] return substitute
def lowercaseacutedsub(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ᾴ', 'H': u'ῄ', 'W': u'ῴ', } substitute = substitutions[val] return substitute
def lowercaseroughsub(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ᾁ', 'H': u'ᾑ', 'W': u'ᾡ', } substitute = substitutions[val] return substitute
def _match_obj_to_date(m: re.match) -> Optional[date]: """ Convert a successful regex match to a datetime.date object Called by: lines_in_weeks_out() """ if m: # group(3) is the year, group(1) is the month, group(2) is the day dt = [int(m.group(x)) for x in (3, 1, 2)] return datetime.date(dt[0], dt[1], dt[2]) # year, month, day else: return None
def lowercasesmoothsub(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ᾀ', 'H': u'ᾐ', 'W': u'ᾠ', } substitute = substitutions[val] return substitute
def lowercasegravesub(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ᾲ', 'H': u'ῂ', 'W': u'ῲ', } substitute = substitutions[val] return substitute
def replace(self, match: re.match, msg: str) -> str:
    """Mask the local and domain halves of a matched email address.

    Postfix message IDs that merely look like email addresses are
    returned unchanged.
    """
    email = match.group()
    # Return the details unchanged if they look like a Postfix message ID.
    if self.is_message_id(match, msg):
        return email
    # The "@" can show up in the local part, but shouldn't appear in the
    # domain part, hence the right-most split.
    local, domain = email.rsplit("@", 1)
    masked_local = self.mask_local(local)
    masked_domain = self.mask_domain(domain)
    return masked_local + '@' + masked_domain
def lowercaseroughcircumflex(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ἇ', 'E': u'ἑ͂', # IG: TE=S BOLE=S E(=I 'I': u'ἷ', 'O': u'ὁ͂', # IG: PE]RI\ DE\ O(=[N !]DIK 'U': u'ὗ', 'H': u'ἧ', 'W': u'ὧ', } substitute = substitutions[val] return substitute
def lowercaseacute(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ά', 'E': u'έ', 'I': u'ί', 'O': u'ό', 'U': u'ύ', 'H': u'ή', 'W': u'ώ', } substitute = substitutions[val] return substitute
def lowercasegrave(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ὰ', 'E': u'ὲ', 'I': u'ὶ', 'O': u'ὸ', 'U': u'ὺ', 'H': u'ὴ', 'W': u'ὼ', } substitute = substitutions[val] return substitute
def lowercasesircumflexdiaresis(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'', 'E': u'', 'I': u'', 'O': u'', 'U': u'ῧ', 'H': u'', 'W': u'', } substitute = substitutions[val] return substitute
def capitalsmoothgraveadscript(match: re.match, g=1) -> str: val = match.group(g) substitutions = { 'A': u'ᾊ', 'E': u'', 'I': u'', 'O': u'', 'U': u'', 'H': u'ᾚ', 'W': u'ᾪ', } substitute = substitutions[val] return substitute
def from_gcov_match(cls, linenum: int, match: re.match):
    """Build a function-summary record from a gcov function-line match.

    Groups are (name, called, returned, blocks); the three counters are
    parsed as integers.
    """
    name, called_str, returned_str, blocks_str = match.groups()
    return cls(name, linenum, int(called_str), int(returned_str),
               int(blocks_str))
def from_gcov_match(cls, match: re.match):
    """Build a record from a gcov match with named groups.

    Reads the named groups 'count', 'id', 'type' and 'info'; a missing
    'count' defaults to 0, and 'count'/'id' are parsed as integers.
    """
    return cls(
        int(match.group('count') or '0'),
        int(match.group('id')),
        match.group('type'),
        match.group('info'),
    )