def is_message_id(self, match: re.match, msg: str) -> bool:
        """Return True when the matched text looks like (part of) a Postfix
        message ID rather than a real e-mail address.

        :param match: regex match for a candidate e-mail address in *msg*
        :param msg: the full text the match was found in
        """
        start = match.start()
        email = match.group()

        # Note that our regex will match things like "message-id=Issue1649523226559@postfix-mail.mail-system.svc.cluster.local"
        # so we need to filter / check for these first

        if email.startswith(self.MESSAGE_ID_LINE):
            return True

        # Only scan backwards when there is room for the message-id marker
        # before the match.
        if start >= self.MESSAGE_ID_LINE_LEN:
            pos = start - 1
            # Walk left over opening brackets/quotes until '=' is found;
            # any other character means this is not a "message-id=..." form.
            # NOTE(review): pos has no lower bound here — a long run of
            # bracket/quote characters could walk past index 0; confirm
            # inputs make that impossible.
            while True:
                char = msg[pos]
                if char == '=':
                    break
                elif char in '{<["\'':
                    pos = pos - 1
                    continue

                return False

            # Compare the text immediately preceding '=' with the marker.
            check = msg[pos - self.MESSAGE_ID_LINE_LEN + 1:pos + 1]
            if check == self.MESSAGE_ID_LINE:
                return True

        return False
 def helper(match: re.match):
     """Keep Presto reserved keywords (and sort directions) untouched;
     otherwise rewrite the matched '<expr> <alias>' as '<expr> as <alias>'."""
     alias = match.groupdict()["alias"]
     token = alias.upper()
     if token in self.reserved_keywords or token in ("ASC", "DESC"):
         # Not an alias but the continuation of the SQL logic.
         return match.group()
     # Re-insert the alias with an explicit "as".
     return match.group()[:-len(alias)] + "as " + alias
Exemple #3
0
def match_to_free(line: int, m: re.match) -> Free:
    """Given a match that has groups: ('77633', '__init__.py', '422', 'validate', '0x7fca858e4200')
    this returns a Free() object."""
    # NOTE(review): the docstring previously said "Malloc() object" — this
    # builder returns Free. Group 5 is parsed as a hexadecimal address.
    return Free(
        line,
        int(m.group(1)),
        m.group(2),
        int(m.group(3)),
        m.group(4),
        int(m.group(5), 16),
    )
Exemple #4
0
def match_to_malloc(line: int, m: re.match) -> Malloc:
    """Given a match that has groups: ('77633', 'cmn_cmd_opts.py', '141', 'set_log_level', '560', '0x7fca83ef4240')
    this returns a Malloc() object."""
    g = m.groups()
    # Last group is a hex address; the 1st, 3rd and 5th are decimal ints.
    return Malloc(line, int(g[0]), g[1], int(g[2]), g[3], int(g[4]),
                  int(g[5], 16))
Exemple #5
0
    def _makeRange(m: re.match, s: int, e: int, i: int) -> List[float]:
        # For a pythonesque ranging:
        #list(arange(float(m.group(s)),float(m.group(e)),float(m.group(i))))
        # BUT probably more expected ranging includes end point...

        # Check if there is no interval and ensure minimal return is start
        tol = 1e-3  # Attempt to allow for just missing the end
        sign = lambda x: copysign(1, x)
        (vs, ve, vi) = (float(m.group(s)), float(m.group(e)),
                        float(m.group(i)))
        # Tolerate the interval being the wrong way round for the increment
        if vi == 0.0 or sign(vi) * (ve - vs) <= 0:  # arange will now be ok
            return [vs]
        return list(arange(vs, ve + (ve - vs) * tol, vi))
Exemple #6
0
def literal_string_decode(match: re.match) -> str:
    """
	url_decode, but as a response for re.sub.
	Intended to be used with pattern['string'], as it expects \\1 to be the quote kind and \\2 its contents.
	It produces single-quote strings when possible, but when the string contains a \n character, is converted to triple quotes.
	It escapes all quotes and activates all escape sequences.
	
	>>> import re
	>>> re.sub(r"(')([\s\S]*)\\1",literal_string_decode,"asdasd'YXNkCmFzZA=='asdasd")
	"asdasd'''asd\\nasd'''asdasd"
	"""
    body = special_activate(quote_escape(url_decode(match.group(2))))
    # Multi-line content needs triple quotes; otherwise reuse the quote
    # kind captured in group 1.
    quote = "'''" if "\n" in body else match.group(1)
    return quote + body + quote
Exemple #7
0
 def get_context(self, document: str, match_found: re.match,
                 context_config: dict):
     """
     This method gets the context around a found match in the document in accordance with the context configuration
     :param document: The document to be scanned
     :param match_found: The match that is to be used as the center of the context window
     :param context_config: The context configuration
     :return: A string containing the context around the found match (Can parameterize later to return str or list!)
     """
     # The matched text, plus everything before and after it.
     match_str = document[match_found.start():match_found.end()].strip()
     preceding_text = document[:match_found.start()]
     succeeding_text = document[match_found.end():]
     if context_config['type'] == ContextType.WORD:
         # Word context: take config['size'] whitespace-separated words on
         # each side of the match, marked with 'TARGETWORD'.
         preceding_text_words = self.trim_boundaries(
             re.split(r'\s+', preceding_text))
         succeeding_text_words = self.trim_boundaries(
             re.split(r'\s+', succeeding_text))
         # NOTE(review): this branch returns a (match, context) tuple while
         # the PARAGRAPH branch returns only the context string — confirm
         # callers expect this asymmetry.
         return match_str, ' '.join(
             preceding_text_words[len(preceding_text_words) -
                                  context_config['size']:] +
             ['TARGETWORD'] +
             succeeding_text_words[:context_config['size']])
     if context_config['type'] == ContextType.PARAGRAPH:
         # Paragraph context: split into lines; empty lines delimit
         # paragraphs.
         preceding_text_lines = self.trim_boundaries(
             re.split(self.split_lines_regex, preceding_text))
         succeeding_text_lines = self.trim_boundaries(
             re.split(self.split_lines_regex, succeeding_text))
         # Indices of blank lines on each side of the match.
         preceding_text_empty_line_indices = [
             index for index, item in enumerate(preceding_text_lines)
             if len(item.strip()) < 1
         ]
         succeeding_text_empty_line_indices = [
             index for index, item in enumerate(succeeding_text_lines)
             if len(item.strip()) < 1
         ]
         # No blank line at all: treat the entire side as one paragraph.
         if not preceding_text_empty_line_indices:
             preceding_text_empty_line_indices = [-1]
         if not succeeding_text_empty_line_indices:
             succeeding_text_empty_line_indices = [
                 len(succeeding_text_lines)
             ]
         # Take config['size'] paragraphs on each side, clamped to what is
         # available (the else -1 branches fall back to the outermost
         # blank-line boundary).
         return ' '.join(
             preceding_text_lines[preceding_text_empty_line_indices[
                 len(preceding_text_empty_line_indices) -
                 context_config['size'] if context_config['size'] < len(
                     preceding_text_empty_line_indices) else -1] + 1:] +
             ['TARGETWORD'] +
             succeeding_text_lines[:succeeding_text_empty_line_indices[
                 context_config['size'] - 1 if context_config['size'] < len(
                     succeeding_text_empty_line_indices) else -1]])
    def replace(self, match: re.match, msg: str) -> str:
        """Replace the matched e-mail address with an HMAC-SHA256 digest.

        Postfix message IDs that merely look like addresses are returned
        unchanged. Depending on configuration the address is lowercased,
        hashed as a whole or per local/domain part, and truncated.
        """
        email = match.group()

        # Return the details unchanged if they look like Postfix message ID
        if self.is_message_id(match, msg):
            return email

        if not self.case_sensitive:
            email = email.lower()

        def _digest(text: str) -> str:
            # HMAC with the configured salt, optionally shortened.
            digest = hmac.new(self.salt_encoded, text.encode(),
                              hashlib.sha256).hexdigest()
            return digest[:8] if self.short_sha else digest

        if self.split:
            # The "@" can show up in the local part, but shouldn't appear
            # in the domain part (at least not that we know), so split on
            # the last one.
            local, domain = email.rsplit("@", 1)
            email = _digest(local) + "@" + _digest(domain)
        else:
            email = _digest(email)

        return self.prefix + email + self.suffix
Exemple #9
0
    def _cast_in_subquery(self, sql: str, result: re.match, **kwargs) -> str:
        """Value & subqueries results must have the same data type.
        Example: select 1 in (select '1') --> line 1:10: value and result of subquery must be of the same type for IN expression: integer vs varchar (1)
        Fix: select cast(1 AS varchar) in (select '1')

        Args:
            sql (str): SQL to fix
            result (re.match): Regex match object

        Returns:
            str: Fixed SQL
        """
        # Presto reports 1-based positions; convert to 0-based indices.
        line, column = int(result["line"]) - 1, int(
            result["column"]) - 1  # Presto to python array notation
        token, idx = self.ColumnCaster.get_problematic_token(
            sql, line,
            column)  # Should return an identifier list or identifier
        logging.debug(
            f"[DEBUG] Found token {[token]}\nvalue:{token}|ttype:{token.ttype}\nparent:{token.parent}"
        )
        # Wrap the offending value in a cast to the subquery's type
        # (captured by the error-message regex as "f_type_0").
        return self.ColumnCaster.cast_non_trivial_tokens(
            sql,
            token,
            idx,
            result["f_type_0"],
            result.groupdict(),
            count_forward_tokens=0)
Exemple #10
0
def _subHtmlTag(match: re.match) -> str:
	""" Determines whether to replace the tag with a space or to just remove it. """
	startIndex, endIndex = match.span()
	return "" if (
		startIndex == 0 or match.string[startIndex - 1].isspace()
		or endIndex == len(match.string) or match.string[endIndex].isspace()
	) else " "
Exemple #11
0
    def _cast_in(self, sql: str, result: re.match, **kwargs) -> str:
        """Value & IN statement must have the same type of data type.
        Example: select a \nfrom cte\nwhere a in (  \n 1, 2, 3) --> line 4:2: IN value and list items must be the same type: bigint (1)
        Fix: select a \nfrom cte\nwhere cast(a AS bigint) in (  \n 1, 2, 3)

        Args:
            sql (str): SQL to fix
            result (re.match): Regex match object

        Raises:
            ValueError: Data type in IN statement are inconsistent. It is not clear which data type
            should be chosen.

        Returns:
            str: Fixed SQL
        """
        # Presto reports 1-based positions; convert to 0-based indices.
        line, column = int(result["line"]) - 1, int(
            result["column"]) - 1  # Presto to python array notation
        token, idx = self.ColumnCaster.get_problematic_token(
            sql, line,
            column)  # Should return an identifier list or identifier
        # Collect the literal tokens of the IN list, skipping separators.
        if isinstance(token, IdentifierList):
            token_in_list = [
                t for t in token.tokens
                if t.ttype not in (Whitespace, Punctuation, Comment, Newline)
            ]
        else:
            token_in_list = [token]
        # Choose the cast target from the homogeneous literal type.
        if all(t.ttype == Literal.String.Single for t in token_in_list):
            cast_to = "varchar"
        elif all(t.ttype == Literal.Number.Integer for t in token_in_list):
            cast_to = "bigint"
        elif all(t.ttype == Literal.Number.Float for t in token_in_list):
            cast_to = "double"
        else:
            raise ValueError(
                f"Inconsistent data type in the IN list! {[token_in_list]}")

        # Rewind idx from the IN list back to the IN keyword itself by
        # subtracting the lengths of preceding sibling tokens.
        # NOTE(review): if token is its parent's first child, parent_idx-1
        # is -1 and the slice [-1::-1] walks the whole reversed token list
        # (including token itself) — confirm this is intended.
        parent_idx = token.parent.tokens.index(token)
        for preceding_token in token.parent.tokens[
                parent_idx -
                1::-1]:  # Subtract the length of all preceding siblings
            idx -= len(preceding_token.value)
        grand_parent_idx = token.parent.parent.tokens.index(
            token.parent
        )  # Move up to the grand parent token. See tests to understand why this is relevant
        # Continue through the grand parent's preceding siblings until the
        # IN keyword is found; the cast is then anchored on that keyword.
        for preceding_token in token.parent.parent.tokens[grand_parent_idx -
                                                          1::-1]:
            idx -= len(preceding_token.value)
            if preceding_token.ttype == Keyword and preceding_token.value.lower(
            ) == "in":
                token = preceding_token
                break
        return self.ColumnCaster.cast_non_trivial_tokens(
            sql,
            token,
            idx,
            cast_to,
            result.groupdict(),
            count_forward_tokens=0)
Exemple #12
0
 def escape_char(m: re.match) -> str:
     """Escape one matched character: '/' becomes '--', '-' becomes '-m',
     anything else becomes '-' followed by its lowercase hex code."""
     ch = m.group()
     specials = {'/': '--', '-': '-m'}
     if ch in specials:
         return specials[ch]
     return '-' + format(ord(ch), 'x')
Exemple #13
0
 def escape_char(m: re.match) -> str:
     """Escape one matched character: '/' -> '--', '-' -> '-m', otherwise
     '-' plus the character's lowercase hex code."""
     c = m.group()
     if c in ('/', '-'):
         return '--' if c == '/' else '-m'
     # '%x' renders without the '0x' prefix that hex() would add.
     return '-%x' % ord(c)
Exemple #14
0
    def get_group_str(self, match: re.match = None, group_name: str = ''):
        """Return the text captured by *group_name* in *match*.

        Returns '' when *match* is None or the group name does not exist
        in the pattern. A group that exists but did not participate in the
        match still yields None, preserving the original behaviour.
        """
        group_str = ''
        if match is None:
            # Bug fix: previously match.group() on the default None match
            # raised an uncaught AttributeError.
            return group_str
        try:
            group_str = match.group(group_name)
        except IndexError:
            # Unknown group name: fall back to the empty string.
            pass

        return group_str
Exemple #15
0
 def from_gcov_match(cls, match: re.match):
     """Build an instance from a gcov source-line match whose groups are
     (coverage, line number, source text)."""
     cov_txt, line_txt, src = match.groups()
     if cov_txt == '-':
         count = -1          # line carries no executable code
     elif cov_txt in ('#####', '====='):
         count = 0           # executable but never executed
     else:
         count = int(cov_txt)
     return cls(int(line_txt), src, count)
Exemple #16
0
    def format(self, path: Path, match: re.match):
        """
        Format the path with the result of the matching.
        Only replace what was captured.
        """
        assert match is not None

        # Text outside the captured span is kept verbatim.
        head = match.string[:match.start()]
        tail = match.string[match.end():]

        reformatted = file_formatter.format(
            self.renamer,
            None,
            *match.groups(),
            **match.groupdict())

        return self.untouched_root(path) / Path(head + reformatted + tail)
Exemple #17
0
def free_key_quote(match: re.match) -> str:
    '''
	Response to re.sub, returns a quoted free-key if \\2 is a valid python identifier.
	\\1 is assumed to be the indentation level.
	String key is encoded.
	
	>>> import re
	>>> re.sub(r"(\\t*)([^' \\n,\{]+)(?= *:)",free_key_quote,"\\ta:4")
	"\\t'YQ==':4"
	>>> re.sub(r"(\\t*)([^' \\n,\{]+)(?= *:)",free_key_quote,"\\t'a':4")
	"\\t'a':4"
	'''
    level = match.group(1)
    key = match.group(2)
    if is_raw_key(key):
        return level + key
    elif key.isidentifier():
        return f"{level}'{url_encode(key)}'"
    # Bug fix: the function previously fell through here and returned None,
    # which makes re.sub raise TypeError. Keys that are neither raw keys
    # nor valid identifiers are now left unchanged.
    return match.group()
Exemple #18
0
 def from_gcov_match(cls, match: re.match):
     """Build an instance from a gcov source-line match: the groups are
     (coverage marker, line number, source text)."""
     (coverage_str, linenum_str, source) = match.groups()
     # '-' means non-executable (-1); '#####'/'=====' mean never run (0).
     special = {'-': -1, '#####': 0, '=====': 0}
     if coverage_str in special:
         coverage = special[coverage_str]
     else:
         coverage = int(coverage_str)
     return cls(int(linenum_str), source, coverage)
Exemple #19
0
def change(match: re.match):
    """Normalise a bare decimal literal: '.5' -> '0.5', '5.' -> '5.0',
    anything else is returned unchanged."""
    text = match.group()

    if text[0] == '.':
        return '0' + text
    if text[-1] == '.':
        return text + '0'
    return text
Exemple #20
0
    def _wrap_date_match(order: str, match: re.match, pattern: str=None) -> dict or None:
        """

        Args:
            order: enums['MDY', 'DMY', 'YMD'] - order of the date
            match: re.match - a regex match object
            pattern: str - if user defined the pattern, record it here

        Returns:

        """
        return {
            'value': match.group(),
            'groups': match.groups(),
            'start': match.start(),
            'end': match.end(),
            'order': order,
            'pattern': pattern
        } if match else None
def lowercasesircumflexsub(match: re.match, g=1) -> str:
    """Map the capital vowel marker ('A', 'H' or 'W') captured in group *g*
    to its lower-case circumflex + iota-subscript form."""
    # Raises KeyError for any unmapped character, as before.
    return {
        'A': u'ᾷ',
        'H': u'ῇ',
        'W': u'ῷ',
    }[match.group(g)]
def lowercaseacutedsub(match: re.match, g=1) -> str:
    """Map the capital vowel marker ('A', 'H' or 'W') captured in group *g*
    to its lower-case acute + iota-subscript form."""
    table = {'A': u'ᾴ', 'H': u'ῄ', 'W': u'ῴ'}
    # Unmapped characters raise KeyError, as in the original.
    return table[match.group(g)]
def lowercaseroughsub(match: re.match, g=1) -> str:
    """Map the capital vowel marker ('A', 'H' or 'W') captured in group *g*
    to its lower-case rough breathing + iota-subscript form."""
    val = match.group(g)
    table = {'A': u'ᾁ', 'H': u'ᾑ', 'W': u'ᾡ'}
    return table[val]
Exemple #24
0
    def _match_obj_to_date(m: re.match) -> Optional[date]:
        """
        Convert a successful regex match to a datetime.date object

        Called by: lines_in_weeks_out()
        """
        if m:
            # group(3) is the year, group(1) is the month, group(2) is the day
            dt = [int(m.group(x)) for x in (3, 1, 2)]
            return datetime.date(dt[0], dt[1], dt[2])  # year, month, day
        else:
            return None
def lowercasesmoothsub(match: re.match, g=1) -> str:
    """Map the capital vowel marker ('A', 'H' or 'W') captured in group *g*
    to its lower-case smooth breathing + iota-subscript form."""
    return {
        'A': u'ᾀ',
        'H': u'ᾐ',
        'W': u'ᾠ',
    }[match.group(g)]
def lowercasegravesub(match: re.match, g=1) -> str:
    """Map the capital vowel marker ('A', 'H' or 'W') captured in group *g*
    to its lower-case grave + iota-subscript form."""
    val = match.group(g)
    return {'A': u'ᾲ', 'H': u'ῂ', 'W': u'ῲ'}[val]
    def replace(self, match: re.match, msg: str) -> str:
        """Mask the matched e-mail address, leaving Postfix message IDs
        untouched."""
        email = match.group()

        # Return the details unchanged if they look like Postfix message ID
        if self.is_message_id(match, msg):
            return email

        # The "@" can show up in the local part, but shouldn't appear in
        # the domain part (at least not that we know) — split on the last.
        local, domain = email.rsplit("@", 1)

        return self.mask_local(local) + '@' + self.mask_domain(domain)
def lowercaseroughcircumflex(match: re.match, g=1) -> str:
    """Map the capital vowel marker captured in group *g* to its lower-case
    rough breathing + circumflex form."""
    table = {
        'A': u'ἇ',
        'E': u'ἑ͂',  # IG: TE=S BOLE=S E(=I
        'I': u'ἷ',
        'O': u'ὁ͂',  # IG: PE]RI\ DE\ O(=[N !]DIK
        'U': u'ὗ',
        'H': u'ἧ',
        'W': u'ὧ',
    }
    return table[match.group(g)]
def lowercaseacute(match: re.match, g=1) -> str:
    """Map the capital vowel marker captured in group *g* to its lower-case
    acute-accented form."""
    return {
        'A': u'ά',
        'E': u'έ',
        'I': u'ί',
        'O': u'ό',
        'U': u'ύ',
        'H': u'ή',
        'W': u'ώ',
    }[match.group(g)]
def lowercasegrave(match: re.match, g=1) -> str:
    """Map the capital vowel marker captured in group *g* to its lower-case
    grave-accented form."""
    val = match.group(g)
    table = {
        'A': u'ὰ',
        'E': u'ὲ',
        'I': u'ὶ',
        'O': u'ὸ',
        'U': u'ὺ',
        'H': u'ὴ',
        'W': u'ὼ',
    }
    return table[val]
def lowercasesircumflexdiaresis(match: re.match, g=1) -> str:
    """Map the capital vowel marker captured in group *g* to its lower-case
    circumflex + diaeresis form; only 'U' has a mapping, the rest yield ''."""
    # Empty strings are deliberate placeholders from the original table.
    return {
        'A': u'',
        'E': u'',
        'I': u'',
        'O': u'',
        'U': u'ῧ',
        'H': u'',
        'W': u'',
    }[match.group(g)]
def capitalsmoothgraveadscript(match: re.match, g=1) -> str:
    """Map the capital vowel marker captured in group *g* to its capital
    smooth breathing + grave + adscript form ('A', 'H', 'W' only; other
    vowels deliberately map to '')."""
    table = {
        'A': u'ᾊ',
        'E': u'',
        'I': u'',
        'O': u'',
        'U': u'',
        'H': u'ᾚ',
        'W': u'ᾪ',
    }
    return table[match.group(g)]
Exemple #33
0
 def from_gcov_match(cls, linenum: int, match: re.match):
     """Build a function-summary record from a gcov match whose groups are
     (name, call count, return count, block count)."""
     name, called, returned, blocks = match.groups()
     return cls(name, linenum, int(called), int(returned), int(blocks))
Exemple #34
0
 def from_gcov_match(cls, match: re.match):
     """Build a record from the named groups count/id/type/info; an empty
     or missing 'count' group defaults to 0."""
     raw_count = match.group('count')
     return cls(
         int(raw_count) if raw_count else 0,
         int(match.group('id')),
         match.group('type'),
         match.group('info'),
     )