def test_bad_reduction_bug():
	# DEFECT: "0{2}|1{2}" was erroneously reduced() to "[01]{2}"
	bad = parse("0{2}|1{2}").to_fsm({"0", "1", fsm.anything_else})
	assert bad.accepts("00")
	assert bad.accepts("11")
	assert not bad.accepts("01")
	# FIX: use a raw string -- "\d" inside a plain literal is an invalid
	# escape sequence (DeprecationWarning today, a SyntaxError eventually).
	assert str(parse("0|[1-9]|ab").reduce()) == r"\d|ab"
Example #2
0
def test_bug_36_2():
	# Regression test: "/etc/.*" and "/etc/something.*" both accept
	# "/etc/something", so neither FSM may be reported as disjoint from the other.
	etc1 = parse("/etc/.*").to_fsm()
	etc2 = parse("/etc/something.*").to_fsm()
	assert etc1.accepts("/etc/something")
	assert etc2.accepts("/etc/something")
	assert not etc1.isdisjoint(etc2)
	assert not etc2.isdisjoint(etc1)
Example #3
0
def format_check_orm_regex(instance):
    """Return True when *instance* parses as a valid regex, False otherwise."""
    # pylint:disable=broad-except
    try:
        lego.parse(instance)
        return True
    except Exception:
        return False
def test_special_cases_for_charclass():
	# A hyphen at either edge of a character class is a literal hyphen,
	# not a range operator.
	for source in ('[- ]', '[ -]'):
		cc = parse(source)
		assert cc.matches('-')
		assert cc.matches(' ')
def is_string_subtype(s1, s2):
    """Return True iff string schema ``s1`` describes a subtype of ``s2``.

    The cheap length-interval check runs first; only when lengths are
    compatible do we compare the regex patterns via intersection with
    the complement (L(s1) subset of L(s2)  <=>  L(s1) & ~L(s2) is empty).
    """
    if s2.get("type") != "string":
        return False
    #
    s1 = JsonString(s1)
    s2 = JsonString(s2)
    #
    # NOTE(review): uninhabited-type handling was previously invoked here
    # (handle_uninhabited_types); confirm it is covered elsewhere.
    #
    is_sub_interval = is_sub_interval_from_optional_ranges(
        s1.min, s1.max, s2.min, s2.max)
    if not is_sub_interval:
        return False
    #
    # At this point length is compatible, so only the patterns matter.
    # Use `is None` (identity) rather than `== None` per PEP 8.
    if s2.pattern is None or s2.pattern == "":
        # s2 is unconstrained: anything s1 accepts is fine.
        return True
    elif s1.pattern is None or s1.pattern == "":
        # s1 unconstrained but s2 constrained: cannot be a subtype.
        return False
    elif s1.pattern == s2.pattern:
        return True
    else:
        regex1 = parse(s1.pattern)
        regex2 = parse(s2.pattern)
        # Subset test: no string may be matched by s1 but rejected by s2.
        return (regex1 & regex2.everythingbut()).empty()
Example #6
0
def test_bug_36_1():
	# ".*" and "s.*" overlap (both accept "s"), so the FSMs are not disjoint.
	dot_star = parse(".*").to_fsm()
	s_dot_star = parse("s.*").to_fsm()
	for machine in (dot_star, s_dot_star):
		assert machine.accepts("s")
	assert not dot_star.isdisjoint(s_dot_star)
	assert not s_dot_star.isdisjoint(dot_star)
def test_parse_anchors():
	# Anchors and word boundaries must round-trip through parse()/str().
	assert str(parse(r"\ba\b")) == r"\ba\b"
	assert str(parse(r"^a$")) == r"^a$"
	assert str(parse(r"\Aa\Z")) == r"\Aa\Z"
	assert str(parse(r"\Ga\z")) == r"\Ga\z"
	# The parsed structure exposes the caret/dollar singletons as mults.
	a = parse(r"^a$")
	mults = list(list(a.concs)[0].mults)
	assert mults[0] == caret
	assert mults[2] == dollar
Example #8
0
def regex_isProperSubset(s1, s2):
    ''' regex proper subset is quite expensive to compute
        so we try to break it into two separate checks,
        and do the more expensive check, only if the 
        cheaper one passes first.'''
    reduced1 = parse(s1).reduce()
    reduced2 = parse(s2).reduce()
    # Equivalent languages are never *proper* subsets of each other.
    if reduced1.equivalent(reduced2):
        return False
    # Subset test via complement intersection: L1 & ~L2 must be empty.
    return (reduced1 & reduced2.everythingbut()).empty()
def test_new_reduce():
	# The @reduce_after decorator has been removed from many methods since it
	# takes unnecessary time which the user may not wish to spend.
	# This alters the behaviour of several methods and also exposes a new
	# opportunity for conc.reduce()
	# Empty groups "()" only disappear when reduce() is called explicitly;
	# str() of the unreduced parse preserves them verbatim.
	assert conc.parse("a()").reduce() == charclass.parse("a")
	assert conc.parse("a()()").reduce() == charclass.parse("a")
	assert conc.parse("a.b()()").reduce() == conc.parse("a.b")
	assert str(parse("a.b()()")) == "a.b()()"
	assert str(parse("a.b()()").reduce()) == "a.b"
Example #10
0
def test_hex_escapes():
	# Should be able to parse e.g. "\\x40"
	assert parse("\\x00") == parse("\x00")
	assert parse("\\x40") == parse("@")
	assert parse("[\\x40]") == parse("[@]")
	assert parse("[\\x41-\\x5a]") == parse("[A-Z]")
	assert str(parse("\\x09")) == "\\t" # escape sequences are not preserved
 
	# Printing ASCII control characters? You should get hex escapes
	assert str(parse("\\x00")) == "\\x00"
Example #11
0
def regex_meet(s1, s2):
    """Intersect two regex strings.

    Returns the reduced intersection pattern, the single non-empty
    operand when only one is given, or None when the intersection is
    empty or neither operand is provided.
    """
    if not s1:
        # Only s2 (or nothing) was supplied.
        return s2 if s2 else None
    if not s2:
        return s1
    meet = parse(s1) & parse(s2)
    if meet.empty():
        return None
    return str(meet.reduce())
Example #12
0
def test_fsm():
	# You should be able to to_fsm() a single lego piece without supplying a specific
	# alphabet. That should be determinable from context.
	assert str(from_fsm(parse("a.b").to_fsm())) == "a.b" # not "a[ab]b"

	# A suspiciously familiar example
	bad = parse("0{2}|1{2}").to_fsm()
	assert bad.accepts("00")
	assert bad.accepts("11")
	assert not bad.accepts("01")
	# FIX: raw string -- "\d" in a plain literal is an invalid escape
	# sequence (DeprecationWarning, future SyntaxError).
	assert str(parse("0|[1-9]|ab").reduce()) == r"\d|ab"
Example #13
0
def test_silly_reduction():
	# This one is horrendous and we have to jump through some hoops to get to
	# a sensible result. Probably not a good unit test actually.
	monster = (
		"(aa|bb*aa)a*|((ab|bb*ab)|(aa|bb*aa)a*b)((ab|bb*ab)|(aa|bb*aa)a*b)*" +
		"(aa|bb*aa)a*|((ab|bb*ab)|(aa|bb*aa)a*b)((ab|bb*ab)|(aa|bb*aa)a*b)*"
	)
	# Reversing the FSM and then the rebuilt regex forces a full reduction.
	reversed_machine = reversed(parse(monster).to_fsm())
	result = reversed(from_fsm(reversed_machine))
	assert str(result) == "[ab]*a[ab]"
	short = "[ab]*a?b*|[ab]*b?a*"
	assert str(parse(".*") & parse(short)) == "[ab]*"
Example #14
0
def test_silly_reduction():
	# This one is horrendous and we have to jump through some hoops to get to
	# a sensible result. Probably not a good unit test actually.
	half = "(aa|bb*aa)a*|((ab|bb*ab)|(aa|bb*aa)a*b)((ab|bb*ab)|(aa|bb*aa)a*b)*"
	# Double reversal (FSM, then regex) forces a full reduction pass.
	machine = reversed(parse(half + half).to_fsm())
	result = reversed(from_fsm(machine))
	assert str(result) == "[ab]*a[ab]"
	short = "[ab]*a?b*|[ab]*b?a*"
	assert str(parse(".*") & parse(short)) == "[ab]*"
Example #15
0
def verify(subregex, supregex):
	"""Print whether every string containing *subregex* also contains *supregex*.

	e.g. supregex = r"\\d", subregex = r"\\d{3}"
	"""
	padded_sub = parse(".*" + subregex + ".*")
	padded_sup = parse(".*" + supregex + ".*")

	# Subset check: anything matched by sub but not by sup.
	leftover = padded_sub & (padded_sup.everythingbut())

	suffix = "------    " + subregex + " " + supregex
	if leftover.empty():
		print("Verified" + suffix)
	else:
		print("Not pass!" + suffix)
Example #16
0
def test_fsm():
	# You should be able to to_fsm() a single lego piece without supplying a specific
	# alphabet. That should be determinable from context.
	assert parse("a.b").to_fsm().accepts("acb")

	# With an explicit alphabet the alternation must stay separate branches.
	bad = parse("0{2}|1{2}").to_fsm({"0", "1", fsm.anything_else})
	assert bad.accepts("00")
	assert bad.accepts("11")
	assert not bad.accepts("01")

	# Same behaviour with an inferred alphabet.
	bad = parse("0{2}|1{2}").to_fsm()
	assert bad.accepts("00")
	assert bad.accepts("11")
	assert not bad.accepts("01")
Example #17
0
def test_fsm():
	# You should be able to to_fsm() a single lego piece without supplying a specific
	# alphabet. That should be determinable from context.
	assert parse("a.b").to_fsm().accepts("acb")

	# Explicit alphabet: "0{2}|1{2}" must not collapse to "[01]{2}".
	bad = parse("0{2}|1{2}").to_fsm({"0", "1", fsm.anything_else})
	assert bad.accepts("00")
	assert bad.accepts("11")
	assert not bad.accepts("01")

	# Inferred alphabet: same expectations.
	bad = parse("0{2}|1{2}").to_fsm()
	assert bad.accepts("00")
	assert bad.accepts("11")
	assert not bad.accepts("01")
Example #18
0
    def difference(self, other):
        """Find the difference of two regexps.

        This method uses greenery library to find and
        reduce the difference pattern between two regex's.

        * If `other` is a string, a Python dict or a FiniteSet,
          it is converted to a regex pattern, after which it
          is parsed by `greenery.lego.parse` method and its
          difference with the pattern of the `self` is found.
          See more details here:
          https://github.com/qntm/greenery

        * If `other` is an instance of `EmptySet`, the difference
          is a copy of `self`.

        * If `other` is an instance of `UniversalSet`, the difference
          is an instance of `EmptySet`.

        Parameters
        ----------
        other : set, str, re._pattern_type, RegexSet

        Returns
        -------
        result : RegexSet
            The difference set
        """
        # An empty pattern has nothing to subtract from.
        if self.pattern is None:
            return RegexSet.empty()

        other_exp = []

        if isinstance(other, set):
            # Collect a parsed expression for every convertible member.
            for exp in other:
                exp_str = _regex_to_string(exp)
                if exp_str is not None:
                    other_exp.append(parse(exp_str))
        else:
            other_str = _regex_to_string(other)
            if other_str is not None:
                other_exp.append(parse(other_str))
            else:
                # Unconvertible operand: nothing to subtract.
                return self.copy()
        # Subtract each parsed expression from our own language in turn.
        complement_exp = parse(self.pattern)
        for exp in other_exp:
            complement_exp = complement_exp.difference(exp)

        return RegexSet(str(complement_exp.reduce()))
Example #19
0
    def difference(self, other):
        """Find the difference of two regexps.

        This method uses greenery library to find and
        reduce the difference pattern between two regex's.

        * If `other` is a string, a Python dict or a FiniteSet,
          it is converted to a regex pattern, after which it
          is parsed by `greenery.lego.parse` method and its
          difference with the pattern of the `self` is found.
          See more details here:
          https://github.com/qntm/greenery

        * If `other` is an instance of `EmptySet`, the difference
          is a copy of `self`.

        * If `other` is an instance of `UniversalSet`, the difference
          is an instance of `EmptySet`.

        Parameters
        ----------
        other : set, str, re._pattern_type, RegexSet

        Returns
        -------
        result : RegexSet
            The difference set
        """
        # An empty pattern has nothing to subtract from.
        if self.pattern is None:
            return RegexSet.empty()

        other_exp = []

        if isinstance(other, set):
            # Collect a parsed expression for every convertible member.
            for exp in other:
                exp_str = _regex_to_string(exp)
                if exp_str is not None:
                    other_exp.append(parse(exp_str))
        else:
            other_str = _regex_to_string(other)
            if other_str is not None:
                other_exp.append(parse(other_str))
            else:
                # Unconvertible operand: nothing to subtract.
                return self.copy()
        # Subtract each parsed expression from our own language in turn.
        complement_exp = parse(self.pattern)
        for exp in other_exp:
            complement_exp = complement_exp.difference(exp)

        return RegexSet(str(complement_exp.reduce()))
Example #20
0
 def included(a):
     # Return True when self_exp's language is contained in a's.
     # NOTE(review): `re._pattern_type` is a private name removed in
     # Python 3.8 (use `re.Pattern`) -- confirm the runtime version.
     if isinstance(a, str):
         other_exp = parse(a)
     elif isinstance(a, re._pattern_type):
         other_exp = parse(a.pattern)
     elif isinstance(a, RegexSet):
         if a.pattern:
             other_exp = parse(a.pattern)
         else:
             # Empty RegexSet: treated as containing nothing.
             return False
     else:
         raise AttributeSetError(
             "Regexp object should be of type `str` or `re._pattern_type`!"
         )
     # Subset test: self & complement(other) must be empty.
     return (self_exp & other_exp.everythingbut()).empty()
Example #21
0
 def included(a):
     # Return True when self_exp's language is contained in a's.
     # NOTE(review): `re._pattern_type` was removed in Python 3.8
     # (`re.Pattern` replaces it) -- verify the target interpreter.
     if isinstance(a, str):
         other_exp = parse(a)
     elif isinstance(a, re._pattern_type):
         other_exp = parse(a.pattern)
     elif isinstance(a, RegexSet):
         if a.pattern:
             other_exp = parse(a.pattern)
         else:
             # Empty RegexSet contains nothing.
             return False
     else:
         raise AttributeSetError(
             "Regexp object should be of type `str` or `re._pattern_type`!"
         )
     # Subset test: self & complement(other) must be empty.
     return (self_exp & other_exp.everythingbut()).empty()
Example #22
0
def make_regex(pattern):
    """Build a lego expression from a sequence of character alternatives.

    Each element is either a set of characters (rendered as a character
    class), a single character, or falsy (rendered as ".*").
    Returns None when *pattern* is None.
    """
    if pattern is None:
        return None

    pieces = []
    for element in pattern:
        if not element:
            # hack with completely optional None characters
            # (specifying length may yield an incorrect pattern)
            pieces.append('.*')
        elif len(element) > 1:
            pieces.append('[{}]'.format(''.join(element)))
        else:
            pieces.append(element)

    # optimise the expression with lego
    return lego.parse(''.join(pieces))
Example #23
0
def test_statement_regex_mutual_exclusivity():
    # Pairwise-check that no two probe modules' statement regexes overlap.
    # (nose-style generator test: yields one check per module pair.)
    fsa_list = [
        lego.parse(deverbosify(module._STATEMENT_REGEX.pattern))
        for module in PROBE_MODULES
    ]
    for fsa1, fsa2 in itertools.combinations(fsa_list, 2):
        yield assert_non_overlapping, fsa1, fsa2
Example #24
0
    def create_pfsm_from_fsm(self, ):
        # Build a probabilistic FSM from the regex in self.reg_exp:
        # states/initials/finals come from greenery's FSM; transition
        # probabilities are spread uniformly over each state's symbols.
        fsm_obj = parse(self.reg_exp).to_fsm()

        # Drop greenery's synthetic 'anything_else' symbol from the alphabet.
        self.alphabet = list(
            set([str(i) for i in list(fsm_obj.alphabet)]) - set([
                'anything_else',
            ]))

        states = list(fsm_obj.states)
        self.add_states(states)

        # Uniform log-probability over initial states; LOG_EPS elsewhere.
        initials = [
            fsm_obj.initial,
        ]
        I = [
            np.log(1 / len(initials)) if state in initials else LOG_EPS
            for state in self.states
        ]
        self.set_I(I)
        self.I_backup = self.I.copy()

        # Final states get log(STOP_P); all others LOG_EPS.
        finals = list(fsm_obj.finals)
        F = [
            np.log(self.STOP_P) if state in finals else LOG_EPS
            for state in self.states
        ]
        self.set_F(F)
        self.F_backup = self.F.copy()

        # Strip 'anything_else' transitions before assigning probabilities.
        transitions = fsm_obj.map
        for state_i in transitions:
            trans = transitions[state_i]

            for symbol in list(trans):
                if str(symbol) == 'anything_else':
                    del trans[symbol]
            transitions[state_i] = trans

        for state_i in transitions:
            trans = transitions[state_i]
            state_js = np.array(list(trans.values()))
            if len(state_js) == 0:
                # Dead end: force it to be an absorbing/stopping state.
                self.F[state_i] = 0.
            else:
                symbols_js = np.array(list(trans.keys()))
                # Remaining probability mass (after stopping) is split
                # uniformly across the state's outgoing symbols.
                if self.F[state_i] != LOG_EPS:
                    probs = np.array([
                        (1.0 - np.exp(self.F[state_i])) / len(symbols_js)
                        for i in range(len(symbols_js))
                    ])
                else:
                    probs = np.array([
                        1.0 / len(symbols_js) for i in range(len(symbols_js))
                    ])

                # Group symbols by destination state and register them.
                for state_j in np.unique(state_js):
                    idx = np.where(state_js == state_j)[0]
                    symbols = list(symbols_js[idx])
                    self.add_transitions(state_i, state_j, symbols,
                                         list(probs[idx]))
Example #25
0
def regex_isSubset(s1, s2):
    ''' regex subset is quite expensive to compute
        especially for complex patterns. '''
    if s1 and s2:
        s1 = parse(s1).reduce()
        s2 = parse(s2).reduce()
        try:
            # Finite languages: enumerate and compare the string sets.
            s1.cardinality()
            s2.cardinality()
            return set(s1.strings()).issubset(s2.strings())
        except OverflowError:
            # Infinite language: fall back to the symbolic subset test.
            return s1.equivalent(s2) or (s1 & s2.everythingbut()).empty()
    elif s1:
        return True
    elif s2:
        return False
    # NOTE(review): when both s1 and s2 are falsy this falls through and
    # returns None (falsy) -- confirm callers expect that.
Example #26
0
def test_complexify():
	# Complexify!
	# The intersection of "[bc]*[ab]*" and "[ab]*[bc]*" enumerated
	# shortlex: strings containing "ac" or "ca" are excluded.
	gen = (parse("[bc]*[ab]*") & parse("[ab]*[bc]*")).strings()
	assert next(gen) == ""
	assert next(gen) == "a"
	assert next(gen) == "b"
	assert next(gen) == "c"
	assert next(gen) == "aa"
	assert next(gen) == "ab"
	# no "ac"
	assert next(gen) == "ba"
	assert next(gen) == "bb"
	assert next(gen) == "bc"
	# no "ca"
	assert next(gen) == "cb"
	assert next(gen) == "cc"
	assert next(gen) == "aaa"
Example #27
0
def test_block_comment_regex():
	# I went through several incorrect regexes for C block comments. Here we show
	# why the first few attempts were incorrect
	# Attempt a: rejects comments ending in extra stars ("/***/").
	a = parse("/\\*(([^*]|\\*+[^*/])*)\\*/")
	assert a.matches("/**/")
	assert not a.matches("/***/")
	assert not a.matches("/****/")

	# Attempt b: wrongly accepts "/****/" while rejecting "/***/".
	b = parse("/\\*(([^*]|\\*[^/])*)\\*/")
	assert b.matches("/**/")
	assert not b.matches("/***/")
	assert b.matches("/****/")

	# Attempt c: correct -- trailing "\*+/" absorbs any run of stars.
	c = parse("/\\*(([^*]|\\*+[^*/])*)\\*+/")
	assert c.matches("/**/")
	assert c.matches("/***/")
	assert c.matches("/****/")
Example #28
0
def test_block_comment_regex():
	# I went through several incorrect regexes for C block comments. Here we show
	# why the first few attempts were incorrect
	# Attempt a: rejects comments that end in extra stars.
	a = parse("/\\*(([^*]|\\*+[^*/])*)\\*/")
	assert a.matches("/**/")
	assert not a.matches("/***/")
	assert not a.matches("/****/")

	# Attempt b: inconsistent -- accepts "/****/" but not "/***/".
	b = parse("/\\*(([^*]|\\*[^/])*)\\*/")
	assert b.matches("/**/")
	assert not b.matches("/***/")
	assert b.matches("/****/")

	# Attempt c: correct version with "\*+/" closing the comment.
	c = parse("/\\*(([^*]|\\*+[^*/])*)\\*+/")
	assert c.matches("/**/")
	assert c.matches("/***/")
	assert c.matches("/****/")
Example #29
0
def test_complexify():
	# Complexify!
	# Shortlex enumeration of the intersection language: any string with
	# an "ac" or "ca" substring is absent.
	gen = (parse("[bc]*[ab]*") & parse("[ab]*[bc]*")).strings()
	assert next(gen) == ""
	assert next(gen) == "a"
	assert next(gen) == "b"
	assert next(gen) == "c"
	assert next(gen) == "aa"
	assert next(gen) == "ab"
	# no "ac"
	assert next(gen) == "ba"
	assert next(gen) == "bb"
	assert next(gen) == "bc"
	# no "ca"
	assert next(gen) == "cb"
	assert next(gen) == "cc"
	assert next(gen) == "aaa"
Example #30
0
def test_everythingbut():
	# Regexes are usually gibberish but we make a few claims
	a = parse("a")
	notA = a.everythingbut().to_fsm()
	assert notA.accepts("")
	assert not notA.accepts("a")
	assert notA.accepts("aa")

	# everythingbut(), called twice, should take us back to where we started.
	beer = parse("beer")
	notBeer = beer.everythingbut()
	beer2 = notBeer.everythingbut()
	# (the round trip normalises "ee" to the counted form "e{2}")
	assert str(beer2) == "be{2}r"

	# ".*" becomes "[]" and vice versa under this call.
	everything = parse(".*")
	assert str(everything.everythingbut()) == str(nothing)
	assert str(nothing.everythingbut()) == str(everything)
Example #31
0
def test_everythingbut():
	# Regexes are usually gibberish but we make a few claims
	a = parse("a")
	notA = a.everythingbut().to_fsm()
	assert notA.accepts("")
	assert not notA.accepts("a")
	assert notA.accepts("aa")

	# everythingbut(), called twice, should take us back to where we started.
	beer = parse("beer")
	notBeer = beer.everythingbut()
	beer2 = notBeer.everythingbut()
	# (round trip normalises the doubled "e" to "e{2}")
	assert str(beer2) == "be{2}r"

	# ".*" becomes "[]" and vice versa under this call.
	everything = parse(".*")
	assert str(everything.everythingbut()) == str(nothing)
	assert str(nothing.everythingbut()) == str(everything)
Example #32
0
def test_charclass_str():
	# Exhaustive check of charclass -> string rendering: shorthand classes,
	# escaping, range-compression, and negation.
	assert str(w) == "\\w"
	assert str(d) == "\\d"
	assert str(s) == "\\s"
	assert str(charclass("a")) == "a"
	assert str(charclass("{")) == "\\{"
	assert str(charclass("\t")) == "\\t"
	assert str(charclass("ab")) == "[ab]"
	assert str(charclass("a{")) == "[a{]"
	assert str(charclass("a\t")) == "[\\ta]"
	assert str(charclass("a-")) == "[\\-a]"
	assert str(charclass("a[")) == "[\\[a]"
	assert str(charclass("a]")) == "[\\]a]"
	assert str(charclass("ab")) == "[ab]"
	assert str(charclass("abc")) == "[abc]"
	assert str(charclass("abcd")) == "[a-d]"
	assert str(charclass("abcdfghi")) == "[a-df-i]"
	assert str(charclass("^")) == "^"
	assert str(charclass("\\")) == "\\\\"
	assert str(charclass("a^")) == "[\\^a]"
	assert str(charclass("0123456789a")) == "[0-9a]"
	assert str(charclass("\t\v\r A")) == "[\\t\\v\\r A]"
	assert str(charclass("\n\f A")) == "[\\n\\f A]"
	assert str(charclass("\t\n\v\f\r A")) == "[\\t-\\r A]"
	assert str(charclass("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz|")) == "[0-9A-Z_a-z|]"
	assert str(W) == "\\W"
	assert str(D) == "\\D"
	assert str(S) == "\\S"
	assert str(dot) == "."
	assert str(~charclass("")) == "."
	assert str(~charclass("a")) == "[^a]"
	assert str(~charclass("{")) == "[^{]"
	assert str(~charclass("\t")) == "[^\\t]"
	assert str(~charclass("^")) == "[^\\^]"

	# Arbitrary ranges
	assert str(parse("[\\w:;<=>?@\\[\\\\\\]\\^`]")) == "[0-z]"
	# TODO: what if \d is a proper subset of `chars`?

	# escape sequences are not preserved
	assert str(parse("\\x09")) == "\\t"

	# Printing ASCII control characters? You should get hex escapes
	assert str(parse("\\x00")) == "\\x00"
Example #33
0
def test_charclass_str():
	# Exhaustive check of charclass -> string rendering: shorthand classes,
	# escaping, range-compression, and negation.
	assert str(w) == "\\w"
	assert str(d) == "\\d"
	assert str(s) == "\\s"
	assert str(charclass("a")) == "a"
	assert str(charclass("{")) == "\\{"
	assert str(charclass("\t")) == "\\t"
	assert str(charclass("ab")) == "[ab]"
	assert str(charclass("a{")) == "[a{]"
	assert str(charclass("a\t")) == "[\\ta]"
	assert str(charclass("a-")) == "[\\-a]"
	assert str(charclass("a[")) == "[\\[a]"
	assert str(charclass("a]")) == "[\\]a]"
	assert str(charclass("ab")) == "[ab]"
	assert str(charclass("abc")) == "[abc]"
	assert str(charclass("abcd")) == "[a-d]"
	assert str(charclass("abcdfghi")) == "[a-df-i]"
	assert str(charclass("^")) == "^"
	assert str(charclass("\\")) == "\\\\"
	assert str(charclass("a^")) == "[\\^a]"
	assert str(charclass("0123456789a")) == "[0-9a]"
	assert str(charclass("\t\v\r A")) == "[\\t\\v\\r A]"
	assert str(charclass("\n\f A")) == "[\\n\\f A]"
	assert str(charclass("\t\n\v\f\r A")) == "[\\t-\\r A]"
	assert str(charclass("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz|")) == "[0-9A-Z_a-z|]"
	assert str(W) == "\\W"
	assert str(D) == "\\D"
	assert str(S) == "\\S"
	assert str(dot) == "."
	assert str(~charclass("")) == "."
	assert str(~charclass("a")) == "[^a]"
	assert str(~charclass("{")) == "[^{]"
	assert str(~charclass("\t")) == "[^\\t]"
	assert str(~charclass("^")) == "[^\\^]"

	# Arbitrary ranges
	# FIX: the literal previously relied on invalid escape sequences
	# ("\w", "\]" in a non-raw string); every backslash is now doubled,
	# producing the exact same runtime string without warnings.
	assert str(parse("[\\w:;<=>?@\\[\\\\\\]\\^`]")) == "[0-z]"
	# TODO: what if \d is a proper subset of `chars`?

	# escape sequences are not preserved
	assert str(parse("\\x09")) == "\\t"

	# Printing ASCII control characters? You should get hex escapes
	assert str(parse("\\x00")) == "\\x00"
Example #34
0
def issubset(regex1, regex2):
    """Check whether regex1's language is a subset of regex2's.

    Returns "Verified" when every string matched by regex1 is also
    matched by regex2; otherwise returns a counterexample string, or
    "Timeout" when none could be produced.
    """
    subregex = parse(regex1)
    supregex = parse(regex2)
    # Strings matched by regex1 but NOT by regex2.
    s = subregex & (supregex.everythingbut())
    if s.empty():
        return "Verified"
    # BUG FIX: the original while-loop's guard compared the sentinel
    # before anything was generated ("" != "Timeout"), so it broke out
    # immediately and always returned "".  Generate a real counterexample.
    generator = s.strings()
    counterexample = next(generator, "Timeout")
    return str(counterexample)
Example #35
0
def test_wildcard_generator():
	# Generator needs to handle wildcards as well. Wildcards come last.
	gen = parse("a.b").strings(otherchar="*")
	assert [next(gen) for _ in range(3)] == ["aab", "abb", "a*b"]
	# The generator must then be exhausted.
	exhausted = False
	try:
		next(gen)
	except StopIteration:
		exhausted = True
	assert exhausted
Example #36
0
def test_wildcard_generator():
	# Generator needs to handle wildcards as well. Wildcards come last.
	gen = parse("a.b").strings(otherchar="*")
	for expected in ("aab", "abb", "a*b"):
		assert next(gen) == expected
	# No further strings: next() past the end must raise StopIteration.
	assert next(gen, None) is None
Example #37
0
def regex_isSubset(s1, s2):
    ''' regex subset is quite expensive to compute
        especially for complex patterns. '''
    if s1 and s2:
        s1 = parse(s1).reduce()
        s2 = parse(s2).reduce()
        try:
            # Finite languages: enumerate and compare the string sets.
            s1.cardinality()
            s2.cardinality()
            return set(s1.strings()).issubset(s2.strings())
        except Exception:
            # FIX: `except (OverflowError, Exception)` was redundant
            # (Exception already covers OverflowError) and it made the
            # second `except Exception as e` clause unreachable dead code.
            # catching a general exception thrown from greenery
            # see https://github.com/qntm/greenery/blob/master/greenery/lego.py
            # ... raise Exception("Please choose an 'otherchar'")
            return s1.equivalent(s2) or (s1 & s2.everythingbut()).empty()
    elif s1:
        return True
    elif s2:
        return False
    # NOTE(review): both-empty input falls through and returns None (falsy).
Example #38
0
    def issubset(self, other):
        """Test regexp inclusion relation.

        Tests if a set defined by `self` is included
        in a set defined by `other`.

        Parameters
        ----------
        other : set, str, re._pattern_type, RegexSet
            Another regex to test inclusion.

        Returns
        -------
        `True` if `self` defines a subset of `other`, `False` otherwise

        Raises
        ------
        AttributeSetError
            If the type `other` is not recognized.
        """
        # The empty pattern is a subset of everything.
        if self.pattern is None:
            return True
        else:
            self_exp = parse(self.pattern)

            def included(a):
                # True when self_exp's language is contained in a's.
                # NOTE(review): `re._pattern_type` was removed in
                # Python 3.8 (`re.Pattern`) -- verify runtime version.
                if isinstance(a, str):
                    other_exp = parse(a)
                elif isinstance(a, re._pattern_type):
                    other_exp = parse(a.pattern)
                elif isinstance(a, RegexSet):
                    if a.pattern:
                        other_exp = parse(a.pattern)
                    else:
                        return False
                else:
                    raise AttributeSetError(
                        "Regexp object should be of type `str` or `re._pattern_type`!"
                    )
                # Subset test: self & complement(other) must be empty.
                return (self_exp & other_exp.everythingbut()).empty()

            if isinstance(other, set):
                # Must be included in every non-None member of the set.
                res = True

                for element in other:
                    if element is not None and not included(element):
                        res = False
                        break
            else:
                res = included(other)
            return res
Example #39
0
    def issubset(self, other):
        """Test regexp inclusion relation.

        Tests if a set defined by `self` is included
        in a set defined by `other`.

        Parameters
        ----------
        other : set, str, re._pattern_type, RegexSet
            Another regex to test inclusion.

        Returns
        -------
        `True` if `self` defines a subset of `other`, `False` otherwise

        Raises
        ------
        AttributeSetError
            If the type `other` is not recognized.
        """
        # The empty pattern is a subset of everything.
        if self.pattern is None:
            return True
        else:
            self_exp = parse(self.pattern)

            def included(a):
                # True when self_exp's language is contained in a's.
                # NOTE(review): `re._pattern_type` was removed in
                # Python 3.8 (`re.Pattern`) -- verify runtime version.
                if isinstance(a, str):
                    other_exp = parse(a)
                elif isinstance(a, re._pattern_type):
                    other_exp = parse(a.pattern)
                elif isinstance(a, RegexSet):
                    if a.pattern:
                        other_exp = parse(a.pattern)
                    else:
                        return False
                else:
                    raise AttributeSetError(
                        "Regexp object should be of type `str` or `re._pattern_type`!"
                    )
                # Subset test: self & complement(other) must be empty.
                return (self_exp & other_exp.everythingbut()).empty()

            if isinstance(other, set):
                # Must be included in every non-None member of the set.
                res = True

                for element in other:
                    if element is not None and not included(element):
                        res = False
                        break
            else:
                res = included(other)
            return res
Example #40
0
def test_infinite_generation():
	# Infinite generator, flummoxes both depth-first and breadth-first searches
	# (greenery enumerates in shortlex order instead).
	gen = parse("a*b*").strings()
	assert next(gen) == ""
	assert next(gen) == "a"
	assert next(gen) == "b"
	assert next(gen) == "aa"
	assert next(gen) == "ab"
	assert next(gen) == "bb"
	assert next(gen) == "aaa"
	assert next(gen) == "aab"
	assert next(gen) == "abb"
	assert next(gen) == "bbb"
	assert next(gen) == "aaaa"
Example #41
0
def test_infinite_generation():
	# Infinite generator, flummoxes both depth-first and breadth-first searches
	# (enumeration proceeds in shortlex order, so it never gets stuck).
	gen = parse("a*b*").strings()
	assert next(gen) == ""
	assert next(gen) == "a"
	assert next(gen) == "b"
	assert next(gen) == "aa"
	assert next(gen) == "ab"
	assert next(gen) == "bb"
	assert next(gen) == "aaa"
	assert next(gen) == "aab"
	assert next(gen) == "abb"
	assert next(gen) == "bbb"
	assert next(gen) == "aaaa"
 def dfa_from_regex(s, alphabet=None):
     """
     Using greenery to convert regex to a minimal (canonical) DFA
     :param str s: the input regular expression
     :param str alphabet: (optional) the alphabet for the required output DFA
     :return: MinDFA object, with language equivalent to the input's regex language
     """
     # TODO: consider runtime impact for using alphabet...
     # alphabet = None
     f = parse(s).to_fsm(alphabet)
     # for canonical rep -- transform to minimal MinDFA
     # FIX: greenery's fsm.reduce() returns a NEW reduced FSM and does not
     # mutate in place; the original discarded the result, so the FSM was
     # never actually minimised.
     f = f.reduce()
     res = MinDFA.dfa_from_fsm(f)
     # TODO: currently assuming input str as regex only has '*' operator for infinity
     if '*' not in s:
         res.is_all_words = MinDFA.Ternary.FALSE
     return res
Example #43
0
def regex_to_dfa():
    # Flask endpoint: convert a regex from the request JSON into a DFA
    # description (alphabet, states, finals, initial, transition map).
    received_json = request.get_json(silent=True)

    received_regex = received_json['regex']
    constructed_regex = lego.parse(received_regex)
    constructed_fsm = constructed_regex.to_fsm()

    # Strip greenery's synthetic anything_else symbol from the alphabet.
    alphabet = list(constructed_fsm.alphabet)
    prepared_alphabet = [letter for letter in alphabet if not isinstance(letter, fsm.anything_else_cls)]

    response = {
        "alphabet": prepared_alphabet,
        "states": list(constructed_fsm.states),
        "finals": list(constructed_fsm.finals),
        "initial": str(constructed_fsm.initial),
        "map": constructed_fsm.map,
    }

    return jsonify(response)
Example #44
0
    def pfsm_from_fsm(self, reg_exp):
        # Build a probabilistic FSM from reg_exp: states/initial/finals
        # come from greenery's FSM; per-state transition probabilities are
        # spread uniformly over the outgoing symbols.
        fsm_obj = parse(reg_exp).to_fsm()

        # Drop greenery's synthetic "anything_else" symbol.
        self.alphabet = sorted(
            [str(i) for i in list(fsm_obj.alphabet) if str(i) != "anything_else"]
        )
        self.add_states(list(fsm_obj.states))
        # log(1)=0 for the single initial state, LOG_EPS elsewhere.
        self.set_I(
            [np.log(1) if q == fsm_obj.initial else LOG_EPS for q in self.states]
        )
        # Final states stop with probability STOP_P.
        self.set_F(
            [
                np.log(self.STOP_P) if q in list(fsm_obj.finals) else LOG_EPS
                for q in self.states
            ]
        )

        for q_i in fsm_obj.map:
            # Outgoing transitions minus the "anything_else" symbol.
            transition = {
                symbol: v
                for symbol, v in fsm_obj.map[q_i].items()
                if str(symbol) != "anything_else"
            }

            q_js = np.array(list(transition.values()))
            if len(q_js) == 0:
                # Dead end: make it an absorbing/stopping state.
                self.F[q_i] = 0.0
            else:
                symbols_js = np.array(list(transition.keys()))
                # Remaining mass (after stopping) split uniformly.
                dividend = 1.0 if self.F[q_i] == LOG_EPS else 1.0 - np.exp(self.F[q_i])
                probs = np.array([dividend / len(symbols_js) for _ in symbols_js])

                # Register transitions grouped by destination state.
                for q_j in np.unique(q_js):
                    idx = np.where(q_js == q_j)[0]
                    self.add_transitions(
                        q_i, q_j, list(symbols_js[idx]), list(probs[idx])
                    )
Example #45
0
def test_bug_slow():
	# issue #43
	# from_fsm() used to take pathologically long on this small machine;
	# assert it now finishes quickly and produces the expected regex.
	import time
	m = fsm.fsm(
		alphabet = {'R', 'L', 'U', 'D'},
		states = {
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
			11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		initial = 0,
		finals = {20},
		map = {0: {'D': 1, 'U': 2},
			   1: {'L': 3},
			   2: {'L': 4},
			   3: {'U': 5},
			   4: {'D': 6},
			   5: {'R': 7},
			   6: {'R': 8},
			   7: {'U': 9},
			   8: {'D': 10},
			   9: {'L': 11},
			   10: {'L': 12},
			   11: {'L': 13},
			   12: {'L': 14},
			   13: {'D': 15},
			   14: {'U': 16},
			   15: {'R': 17},
			   16: {'R': 18},
			   17: {'D': 19},
			   18: {'U': 19},
			   19: {'L': 20},
			   20: {}})
	t1 = time.time()
	l = from_fsm(m)
	t2 = time.time()
	assert (t2 - t1) < 60 # should finish in way under 1s
	assert l == parse("(DLURULLDRD|ULDRDLLURU)L").reduce()
Example #46
0
def test_pattern_parsing():
	# Parsing an alternation with a nested group must produce the exact
	# pattern/conc/mult/charclass tree, element by element.
	assert pattern.parse("abc|def(ghi|jkl)") == pattern(
		conc(
			mult(charclass("a"), one),
			mult(charclass("b"), one),
			mult(charclass("c"), one),
		),
		conc(
			mult(charclass("d"), one),
			mult(charclass("e"), one),
			mult(charclass("f"), one),
			mult(
				pattern(
					conc(
						mult(charclass("g"), one),
						mult(charclass("h"), one),
						mult(charclass("i"), one),
					),
					conc(
						mult(charclass("j"), one),
						mult(charclass("k"), one),
						mult(charclass("l"), one),
					),
				), one
			),
		)
	)

	# Accept the "non-capturing group" syntax, "(?: ... )" but give it no
	# special significance
	assert parse("(?:)") == parse("()")
	assert parse("(?:abc|def)") == parse("(abc|def)")
	parse("(:abc)") # should give no problems

	# Named groups
	assert pattern.parse("(?P<ng1>abc)") == parse("(abc)")
Example #47
0
def test_pattern_parsing():
	# Assemble the expected parse tree for "abc|def(ghi|jkl)" piece by piece.
	ghi = conc(
		mult(charclass("g"), one),
		mult(charclass("h"), one),
		mult(charclass("i"), one),
	)
	jkl = conc(
		mult(charclass("j"), one),
		mult(charclass("k"), one),
		mult(charclass("l"), one),
	)
	abc = conc(
		mult(charclass("a"), one),
		mult(charclass("b"), one),
		mult(charclass("c"), one),
	)
	def_branch = conc(
		mult(charclass("d"), one),
		mult(charclass("e"), one),
		mult(charclass("f"), one),
		mult(pattern(ghi, jkl), one),
	)
	assert pattern.parse("abc|def(ghi|jkl)") == pattern(abc, def_branch)

	# Accept the "non-capturing group" syntax, "(?: ... )" but give it no
	# special significance
	assert parse("(?:)") == parse("()")
	assert parse("(?:abc|def)") == parse("(abc|def)")
	parse("(:abc)") # should give no problems

	# Named groups
	assert pattern.parse("(?P<ng1>abc)") == parse("(abc)")
Example #48
0
def test_isinstance_bug():
	# Historical bug: "mult" was imported as both lego.mult and __main__.mult,
	# which Python treats as distinct classes, so isinstance(m, mult) was
	# returning False. Building this star-free expression exercised the path.
	before = parse("").everythingbut()
	after = parse("").everythingbut()
	starfree = (before + parse("aa") + after).everythingbut()
Example #49
0
def regex_matches_string(regex=None, s=None):
    """Return True iff `s` matches `regex`.

    A falsy `regex` (None or empty string) is treated as "no constraint"
    and matches anything.
    """
    if not regex:
        return True
    return parse(regex).matches(s)
Example #50
0
def test_hex_escapes():
	# "\xXX" escapes must be understood both standalone and inside classes.
	cases = [
		("\\x00", "\x00"),
		("\\x40", "@"),
		("[\\x40]", "[@]"),
		("[\\x41-\\x5a]", "[A-Z]"),
	]
	for escaped, literal in cases:
		assert parse(escaped) == parse(literal)
Example #51
0
def test_set_ops():
	# Set difference and symmetric difference on character classes.
	difference = parse("[abcd]") - parse("a")
	assert difference == charclass.parse("[bcd]")
	symmetric_difference = parse("[abcd]") ^ parse("[cdef]")
	assert symmetric_difference == charclass.parse("[abef]")
Example #52
0
def complement_of_string_pattern(s):
    """Return a regex string matching every string NOT matched by `s`."""
    complement = parse(s).everythingbut()
    return str(complement.reduce())
def test_statement_regex_mutual_exclusivity():
    # Every probe module's _STATEMENT_REGEX must be disjoint from every
    # other's, so a statement can never match two probes at once.
    fsas = [
        lego.parse(deverbosify(module._STATEMENT_REGEX.pattern))
        for module in PROBE_MODULES
    ]
    for left, right in itertools.combinations(fsas, 2):
        yield assert_non_overlapping, left, right
Example #54
0
    def intersection(self, other):
        """Find the intersection of two regexps.

        This method uses greenery library to find and
        reduce the intersection pattern.

        * If `other` is a string, a Python dict or a FiniteSet,
          it is converted to a regex pattern, after which it
          is parsed by `greenery.lego.parse` method and its
          intersection with the pattern of the `self` is found.
          The library `greenery` finds the intersection between two
          regex's by constructing corresponding FSM's (finite state
          machines) and finding their intersection, after which it
          is converted back to a regex. See more details here:
          https://github.com/qntm/greenery

        * If `other` is an instance of `EmpySet`, the intersection
          is a `EmpySet` object.

        * If `other` is an instance of `UniversalSet`, the intersection
          is a copy of `self`.

        Parameters
        ----------
        other : set, str, re._pattern_type, RegexSet

        Returns
        -------
        result : RegexSet
            The intersection set
        """
        # An empty pattern intersected with anything is empty.
        if self.pattern is None:
            return RegexSet.empty()

        if self.is_universal():
            # Universal ∩ X == X, so the answer is determined by `other` alone.
            if isinstance(other, set):
                universal_flag = True
                other_exp = []
                for el in other:
                    exp = RegexSet(_regex_to_string(el))
                    other_exp.append(exp)
                    if not exp.is_universal():
                        universal_flag = False
                if universal_flag:
                    # Every element was universal, so the union of them is too.
                    return RegexSet.universal()
                else:
                    result_obj = RegexSet.empty()
                    for exp in other_exp:
                        # NOTE(review): union()'s return value is discarded;
                        # if RegexSet.union is not in-place, these results
                        # are lost — confirm against RegexSet.union.
                        result_obj.union(exp)
                    return result_obj
            else:
                other_obj = RegexSet(_regex_to_string(other))
                if other_obj.is_universal():
                    return RegexSet.universal()
                else:
                    return other_obj

        # Non-universal self: parse our own pattern once up front.
        self_exp = parse(self.pattern)

        # Normalize `other` into a list of parsed greenery expressions.
        other_exp = []
        if isinstance(other, set):
            for exp in other:
                exp_str = _regex_to_string(exp)
                if exp_str is None:
                    # Unconvertible element => empty intersection.
                    return RegexSet.empty()
                other_exp.append(parse(exp_str))
        elif isinstance(other, UniversalSet):
            # X ∩ Universal == X.
            return copy.deepcopy(self)
        elif isinstance(other, EmptySet):
            # X ∩ Empty == Empty.
            return EmptySet()
        else:
            other_str = _regex_to_string(other)
            if other_str is None:
                return RegexSet.empty()
            other_exp.append(parse(other_str))

        # Fold greenery's intersection across all collected expressions.
        intersect_exp = self_exp
        for exp in other_exp:
            intersect_exp = intersect_exp.intersection(exp)

        return RegexSet(str(intersect_exp))
Example #55
0
def test_regex_reversal():
	# reversed() on a parsed regex yields a regex matching reversed strings.
	cases = [
		("b", "b"),
		("e*", "e*"),
		("bear", "raeb"),
		("beer", "reeb"),
		("abc|def|ghi", "cba|fed|ihg"),
		("(abc)*d", "d(cba)*"),
	]
	for forward, backward in cases:
		assert reversed(parse(forward)) == parse(backward)
Example #56
0
import sys
from greenery.lego import parse
import re

# Decide whether the language of argv[1] is a subset of argv[2]'s:
# sub ⊆ sup  iff  sub ∩ complement(sup) is empty.
candidate = parse(sys.argv[1])
container = parse(sys.argv[2])
leftover = candidate & (container.everythingbut())
if leftover.empty():
	print("subset")
else:
	print("notsubset")
Example #57
0
# This code is in the public domain.

# http://qntm.org/greenery

import sys
from greenery.lego import parse

patterns = sys.argv[1:]

if len(patterns) < 2:
    print("Please supply several regexes to compute their intersection, union and concatenation.")
    print("E.g. \"19.*\" \"\\d{4}-\\d{2}-\\d{2}\"")

else:
    # Fold each binary operator over all supplied patterns in turn.
    combined = parse(patterns[0])
    for extra in patterns[1:]:
        combined &= parse(extra)
    print("Intersection:  %s" % ( combined ))

    combined = parse(patterns[0])
    for extra in patterns[1:]:
        combined |= parse(extra)
    print("Union:         %s" % ( combined ))

    combined = parse(patterns[0])
    for extra in patterns[1:]:
        combined += parse(extra)
    print("Concatenation: %s" % ( combined ))

Example #58
0
import sys
from greenery.lego import lego, parse

args = sys.argv[1:]

if len(args) < 2:
	print("Please supply several regexes to compute their intersection, union and concatenation.")
	print("E.g. \"19.*\" \"\\d{4}-\\d{2}-\\d{2}\"")

else:
	# Parse everything up front, then use lego's variadic class methods.
	parsed = [parse(expr) for expr in args]
	print("Intersection:  %s" % ( lego.intersection(*parsed).reduce() ))
	print("Union:         %s" % ( lego.union(*parsed).reduce() ))
	print("Concatenation: %s" % ( lego.concatenate(*parsed).reduce() ))

        def _isObjectSubtype(s1, s2):
            ''' The general intuition is that a json object with more keys is more restrictive
                than a similar object with fewer keys.

                E.g.: if corresponding keys have the same schemas, then
                {name: {..}, age: {..}} <: {name: {..}}
                {name: {..}, age: {..}} />: {name: {..}}

                So the subtype checking is divided into two major parts:
                I) lhs keys/patterns/additional should be a superset of rhs
                II) schemas of comparable keys should have lhs <: rhs
            '''
            if s2.type != "object":
                return False

            # Check properties range
            is_sub_interval = s1.interval in s2.interval
            if not is_sub_interval:
                print_db("__00__")
                return False
            #
            else:
                # If ranges are ok, check another trivial case of almost identical objects.
                # This is some sort of performance heuristic.
                if set(s1.required).issuperset(s2.required) \
                    and s1.properties == s2.properties \
                    and s1.patternProperties == s2.patternProperties \
                    and (s1.additionalProperties == s2.additionalProperties
                         or (utils.is_dict(s1.additionalProperties)
                             and s1.additionalProperties.isSubtype(s2.additionalProperties))):
                    print_db("__01__")
                    return True
            #

            def get_schema_for_key(k, s):
                ''' Searches for matching key and get the corresponding schema(s).
                    Returns iterable because if a key matches more than one pattern,
                    that key schema has to match all corresponding patterns schemas.
                '''
                if k in s.properties.keys():
                    # FIX: was `k.properties[k]` — `k` is the string key, not
                    # the schema; indexing must go through the schema `s`.
                    return [s.properties[k]]
                else:
                    ret = []
                    for k_ in s.patternProperties.keys():
                        if utils.regex_matches_string(k_, k):
                            # in case a key has to be checked against patternProperties,
                            # it has to adhere to all schemas which have pattern matching the key.
                            # FIX: was `k.patternProperties[k_]` — same bug as above.
                            ret.append(s.patternProperties[k_])
                    if ret:
                        return ret

                return [s.additionalProperties]

            # Check that required keys satisfy subtyping.
            # lhs required keys should be superset of rhs required keys.
            if not set(s1.required).issuperset(s2.required):
                print_db("__02__")
                return False
            # If required keys are properly defined, check their corresponding
            # schemas and make sure they are subtypes.
            # This is required because you could have a required key which does not
            # have an explicit schema defined by the json object.

            else:
                for k in set(s1.required).intersection(s2.required):
                    for lhs_ in get_schema_for_key(k, s1):
                        for rhs_ in get_schema_for_key(k, s2):
                            if lhs_:
                                if rhs_:
                                    if not lhs_.isSubtype(rhs_):
                                        print_db("__03__")
                                        return False
                                else:
                                    print_db("__04__")
                                    return False

            # Missing keys on the rhs
            # I) Simple case:
            # lhs = {"properties": {p1: {string}}
            # rhs = {"properties": {p1: {string}, p2: {int}}}
            # >> this means lhs isNOT subtype of rhs cuz lhs
            # would accept any p2 that does not necessarily match
            # the type int of the p2 on the rhs
            # II) what if
            # lhs = {"properties": {p1: {string},
            #        "patternProperties": {p2: {int}}}
            # again, ideally this means lhs isNOT subtype of rhs
            # because lhs accept any property name with pattern .*p2.*
            # III) however, the tricky case is: it could happen that
            # every string matched by patternProperties on the lhs exist as a property
            # or property pattern on the rhs, then we need to do picky and enumerative
            # checks cuz it could be that indeed lhs isSubtype of rhs.

            # Properties on the rhs with no exact-name counterpart on the lhs:
            # each must be covered by an lhs pattern or by additionalProperties.
            extra_keys_on_rhs = set(s2.properties.keys()).difference(
                s1.properties.keys())
            for k in extra_keys_on_rhs.copy():
                for k_ in s1.patternProperties.keys():
                    if utils.regex_matches_string(k_, k):
                        extra_keys_on_rhs.remove(k)
            if extra_keys_on_rhs:
                if not s1.additionalProperties:
                    print_db("__05__")
                    return False
                else:
                    for k in extra_keys_on_rhs:
                        if not s1.additionalProperties.isSubtype(
                                s2.properties[k]):
                            print_db("__06__")
                            return False

            # Pattern properties on the rhs not subsumed by any lhs pattern:
            # they must be covered by lhs additionalProperties, unless the
            # pattern's language is finite (checked via cardinality()).
            extra_patterns_on_rhs = set(
                s2.patternProperties.keys()).difference(
                    s1.patternProperties.keys())
            for k in extra_patterns_on_rhs.copy():
                for k_ in s1.patternProperties.keys():
                    if utils.regex_isSubset(k, k_):
                        extra_patterns_on_rhs.remove(k)
            if extra_patterns_on_rhs:
                if not s1.additionalProperties:
                    print_db("__07__")
                    return False
                else:
                    for k in extra_patterns_on_rhs:
                        if not s1.additionalProperties.isSubtype(
                                s2.patternProperties[k]):
                            try:  # means regex k is infinite
                                parse(k).cardinality()
                            except OverflowError:
                                print_db("__08__")
                                return False

            # first, matching properties should be subtype pairwise
            unmatched_lhs_props_keys = set(s1.properties.keys())
            for k in s1.properties.keys():
                if k in s2.properties.keys():
                    unmatched_lhs_props_keys.discard(k)
                    if not s1.properties[k].isSubtype(s2.properties[k]):
                        return False
                # for the remaining keys, make sure they either don't exist
                # in rhs or if they do, then their schemas should be sub-type
                else:
                    for k_ in s2.patternProperties:
                        if utils.regex_matches_string(k_, k):
                            unmatched_lhs_props_keys.discard(k)
                            if not s1.properties[k].isSubtype(
                                    s2.patternProperties[k_]):
                                return False

            # second, matching patternProperties should be subtype pairwise
            unmatched_lhs_pProps_keys = set(s1.patternProperties.keys())
            for k in s1.patternProperties.keys():
                for k_ in s2.patternProperties.keys():
                    if utils.regex_isSubset(k_, k):
                        unmatched_lhs_pProps_keys.discard(k)
                        if not s1.patternProperties[k].isSubtype(
                                s2.patternProperties[k_]):
                            return False

            # finally, anything on the lhs not matched above must fit within
            # the rhs additionalProperties policy.
            if s2.additionalProperties == True:
                return True
            elif s2.additionalProperties == False:
                if s1.additionalProperties == True:
                    return False
                elif unmatched_lhs_props_keys or unmatched_lhs_pProps_keys:
                    return False
                else:
                    return True
            else:
                for k in unmatched_lhs_props_keys:
                    if not s1.properties[k].isSubtype(s2.additionalProperties):
                        return False
                for k in unmatched_lhs_pProps_keys:
                    if not s1.patternProperties[k].isSubtype(
                            s2.additionalProperties):
                        return False
                if s1.additionalProperties == True:
                    return False
                elif s1.additionalProperties == False:
                    return True
                else:
                    return s1.additionalProperties.isSubtype(
                        s2.additionalProperties)
Example #60
0
def test_equivalence():
	assert parse("aa*").equivalent(parse("a*a"))
	assert parse("([ab]*a|[bc]*c)?b*").equivalent(parse("b*(a[ab]*|c[bc]*)?"))