Example #1
0
    def firsts(self):  # CONST
        """Compute the FIRST set of every symbol of the grammar.

        Returns
        -------
            A dict mapping '&', every terminal and every variable to an
            OrderedSet. Terminals and '&' map to a set holding only themselves;
            a variable maps to the symbols that can start one of its
            productions, plus '&' when one of its productions can vanish
            entirely.

        Exceptions
        --------------
            1. The grammar has left recursion.
        """
        def first_var(v):
            """Calculate the first of a variable using Dynamic Programming."""
            # A non-empty entry is taken as already computed; terminals and
            # '&' are seeded with themselves, so they always return here.
            if len(first[v]) != 0:
                return first[v]

            for prod in self.rules[v]:
                # '&' is tracked per production.  Discarding it from the
                # accumulated set (as was done before) erased the '&'
                # contributed by an earlier production, e.g. A -> & | a
                # ended up with FIRST(A) = {a}.
                all_nullable = True
                for p in prod:
                    to_add = first_var(p)
                    first[v].update([s for s in to_add if s != "&"])
                    # Stop when Xi doesn't have epsilon in its first
                    if "&" not in to_add:
                        all_nullable = False
                        break
                if all_nullable:
                    first[v].add("&")
            return first[v]

        if self.has_left_recursion():
            raise RuntimeError(
                "A grammar can't contain left recursion in ordered to calculate FIRSTS."
            )

        first = {
            t: OrderedSet([t])
            for t in OrderedSet(['&']) | self.terminals
        }
        first.update({v: OrderedSet() for v in self.variables})

        for v in self.variables:
            first_var(v)
        return first
Example #2
0
        def first_first():
            """Detect and factor one FIRST/FIRST conflict among the productions of ``v``.

            Reads ``v`` and ``self`` from the enclosing scope and rebinds the
            enclosing ``conflict_terminal``.

            Returns
            -------
                True when a conflict terminal is set (the grammar was rewritten
                through expose_indirect_ndet and create_new_var_lcp), False
                otherwise.
            """
            nonlocal conflict_terminal
            cached_first = self.firsts()
            # Search for non determinism: a terminal that appears in the FIRST
            # of more than one production of v.
            total = OrderedSet()
            for prod in self.rules[v]:
                for ter in self.first_body(prod, cached_first):
                    # Only the first conflict found is recorded; the break
                    # leaves the inner loop only, the outer loop keeps running.
                    if ter in total and conflict_terminal is None:
                        conflict_terminal = ter
                        break
                    total.add(ter)

            if conflict_terminal:
                # NOTE(review): without a nonlocal declaration this binds a
                # local name that is never read in this function — confirm
                # whether the enclosing scope expected to see it.
                has_non_determinism = True

                # Expose indirect
                expose_indirect_ndet(conflict_terminal)

                # Direct
                # NOTE(review): presumably relies on os.path.commonprefix
                # comparing the production tuples element by element — confirm.
                lcp = os.path.commonprefix([
                    prod for prod in self.rules[v]
                    if prod[0] == conflict_terminal
                ])
                create_new_var_lcp(lcp)
                return True
            return False
Example #3
0
 def create_new_v(sym):
     nonlocal new_v_id
     new_v_id += 1
     new_v = "❬R{}❭".format(new_v_id)
     to_add_var.add(new_v)
     self.rules[new_v] = OrderedSet([(sym, )])
     new_rules[new_v] = OrderedSet([(sym, )])
     return new_v
Example #4
0
    def remove_left_recursion(self):  # NOT CONST
        """
        Rewrite the grammar in place so that no variable is left recursive.

        Variables are processed in the order of self.variables: productions of
        the i-th variable that start with an earlier variable are expanded,
        then direct recursion on the i-th variable is moved to a new variable
        named "❬<var>'❭" that also receives an '&' production.

        Exceptions
        --------------
            1. self is not e-free.
            2. self is not cycle-free.

        Notes
        -----
            Be aware when Aj has no productions.
            While removing direct recursion, one may end up with len(prods_i) == 0, since
            there may be S => Sbeta, where beta has no productions.

        """
        if self.has_e():
            raise RuntimeError(
                "A grammar must be &-free in order to remove left recursions.")

        if self.has_cycle():
            raise RuntimeError(
                "A grammar must be cycle-free in order to remove left recursions."
            )

        # Remove indirect
        # The range is computed once, so variables appended below are not
        # visited by this loop.
        for i in range(len(self.variables)):
            direct = False
            for j in range(i):
                to_remove = set()
                # NOTE(review): the set being iterated is also added to inside
                # the loop; this depends on how OrderedSet iterates while it
                # grows — confirm before changing the container type.
                for production in self.rules[self.variables[i]]:
                    if self.variables[j] == production[0]:
                        # Ai -> Aj alpha becomes Ai -> beta alpha for every Aj -> beta
                        alpha = production[1:]
                        to_remove.add(production)
                        for beta in self.rules[self.variables[j]]:
                            self.rules[self.variables[i]].add(beta + alpha)

                for rem in to_remove:
                    self.rules[self.variables[i]].discard(rem)

            # Check whether Ai still has a production of the form Ai -> Ai alpha
            for production in self.rules[self.variables[i]]:
                if production[0] == self.variables[i]:
                    direct = True

            if direct:
                new_var = "❬{}'❭".format(self.variables[i])
                self.variables.add(new_var)
                self.rules[new_var] = OrderedSet()
                new_prods_i = OrderedSet()
                for production in self.rules[self.variables[i]]:
                    if production[0] == self.variables[i]:
                        # Ai -> Ai alpha  becomes  Ai' -> alpha Ai'
                        self.rules[new_var].add(production[1:] + (new_var, ))
                    else:
                        # Ai -> beta  becomes  Ai -> beta Ai'
                        new_prods_i.add(production + (new_var, ))

                self.rules[self.variables[i]] = new_prods_i
                self.rules[new_var].add(('&', ))
        self.CHECK_GRAMMAR()
Example #5
0
 def __init__(self, table: dict):
     """
     Store the parsing table and derive the symbol sets from its keys.

     The first component of every key is collected into ``variables`` and
     the second component into ``terminals``.

     Pre-conditions
     --------------
         1. table is a valid LL(1) table. That is, it has no conflicts and it's
            not left recursive.
     """
     heads = [key[0] for key in table]
     self.variables = OrderedSet(heads)
     lookaheads = [key[1] for key in table]
     self.terminals = OrderedSet(lookaheads)  # Contains $
     self.table = table
Example #6
0
        def create_new_var_lcp(lcp):
            nonlocal new_var_id
            new_rules_old_v = OrderedSet()
            new_var_id = new_var_id + 1
            if len(v) == 1:
                new_var = "❬{},{}❭".format(v, new_var_id)
            else:
                new_var = "❬{},{}❭".format(v[1], new_var_id)
            self.rules[new_var] = OrderedSet()
            self.variables.add(new_var)

            # Factor
            ll = len(lcp)
            if lcp == ("&", ):
                new_rules_old_v.add((new_var, ))
            else:
                new_rules_old_v.add(lcp + (new_var, ))

            # Add prods to factored variable
            for prod in self.rules[v]:
                if prod[0] == conflict_terminal:
                    if len(prod[ll:]) != 0:
                        self.rules[new_var].add(prod[ll:])
                    else:
                        self.rules[new_var].add(("&", ))
                else:
                    new_rules_old_v.add(prod)

            self.rules[v] = new_rules_old_v
Example #7
0
    def replace_terminals(self):  # NOT CONST
        """
        Replace terminals inside productions of length >= 2 by variables.

        Every terminal occurring in such a production is swapped for a
        variable whose only production is that terminal; an existing variable
        is reused when one is found, otherwise a new ``❬R<id>❭`` is created.

        Pre_conditions: None
        """
        new_v_id = 0
        to_add_var = OrderedSet()
        new_rules = dict()

        # var_to_terminal checks if variable already exists or if variable has already been created
        # var_to_terminal does not consider already duplicated variables
        def var_to_terminal(sym):
            wanted = (sym, )
            for pool in (self.variables, to_add_var):
                for candidate in pool:
                    prods = self.rules[candidate]
                    if len(prods) == 1 and prods[0] == wanted:
                        return candidate
            return None

        # create_new_v creates new variable
        def create_new_v(sym):
            nonlocal new_v_id
            new_v_id = new_v_id + 1
            fresh = "❬R{}❭".format(new_v_id)
            to_add_var.add(fresh)
            self.rules[fresh] = OrderedSet([(sym, )])
            new_rules[fresh] = OrderedSet([(sym, )])
            return fresh

        for v in self.variables:
            rewritten = OrderedSet()
            for prod in self.rules[v]:
                symbols = list(prod)
                size = len(symbols)
                if size >= 2:
                    for i in range(size):
                        symbol = symbols[i]
                        if symbol not in self.terminals:
                            continue
                        stand_in = var_to_terminal(symbol)
                        if stand_in is None:
                            stand_in = create_new_v(symbol)
                        # Swap this occurrence and every later one in one go.
                        for j in range(i, size):
                            if symbols[j] == symbol:
                                symbols[j] = stand_in
                rewritten.add(tuple(symbols))
            new_rules[v] = rewritten

        self.rules = new_rules
        for v in to_add_var:
            self.variables.add(v)
        self.CHECK_GRAMMAR()
Example #8
0
    def follows(self):  # CONST
        """Compute the follows.

        Returns a dict mapping every variable to an OrderedSet; the start
        symbol always holds "$". The rules are swept repeatedly until a full
        sweep adds nothing.
        """
        first = self.firsts()
        follow = {v: OrderedSet() for v in self.variables}
        follow[self.start].add("$")

        changed = True
        while changed:
            changed = False
            for head, bodies in self.rules.items():
                for body in bodies:
                    last = len(body) - 1

                    # Add FIRSTS: what can start the rest of the body follows
                    # each variable that is not in the last position.
                    for i in range(last):
                        sym = body[i]
                        if sym not in self.variables:
                            continue
                        lookahead = self.first_body(body[i + 1:], first)
                        lookahead.discard("&")
                        if not (lookahead <= follow[sym]):
                            changed = True
                            follow[sym].update(lookahead)

                    # Add FOLLOWS: walk the body backwards while the suffix
                    # can vanish, handing down the follow of the head.
                    inherited = follow[head]
                    inherited.discard("&")
                    for i in range(last, -1, -1):
                        sym = body[i]
                        if sym not in self.variables:
                            break
                        if not inherited.issubset(follow[sym]):
                            changed = True
                            follow[sym].update(inherited)
                        if "&" not in first[sym]:
                            break
        return follow
Example #9
0
 def sub_var(prod):
     """Expand the leading variable of ``prod`` by each of its productions.

     Returns an OrderedSet with one production per alternative of
     ``prod[0]``; an '&' alternative leaves just the tail of ``prod`` (or
     '&' itself when the tail is empty).
     """
     head, tail = prod[0], prod[1:]
     expanded = OrderedSet()
     for alternative in self.rules[head]:
         if alternative != ("&", ):
             expanded.add(alternative + tail)
         elif len(tail) == 0:
             expanded.add(("&", ))
         else:
             expanded.add(tail)
     return expanded
Example #10
0
    def has_left_recursion(self):  # CONST
        """Tell whether some variable lies on a loop of the leftmost-variable graph.

        Only the first symbol of each production is considered when building
        the graph.
        """
        # (A, B) is an edge iff A => B𝛼 is a rule in rules
        edges = {v: OrderedSet() for v in self.variables}
        for head, targets in edges.items():
            for prod in self.rules[head]:
                leftmost = prod[0]
                if leftmost in self.variables:
                    targets.add(leftmost)

        graph = Graph(self.variables, edges)
        # Every variable is probed before the results are combined.
        loops = [graph.has_loop(v) for v in self.variables]
        return any(loops)
Example #11
0
 def first_follow():
     """Resolve one FIRST/FOLLOW conflict among the productions of ``v``.

     The first production found with a conflicting variable is replaced by
     the productions obtained by substituting that variable with each of
     its alternatives. Returns True when a substitution was made, False
     otherwise. ``v`` comes from the enclosing scope.
     """
     cached_firsts = self.firsts()
     offending = None
     replacements = OrderedSet()
     for prod in self.rules[v]:
         for i in range(len(prod) - 1):
             # Test if any variable Xi appearing in X1X2...Xn has a Fi/Fo conflict
             # That is, ε in FIRST(Xi) AND FIRST(Xi) ∩ FIRST(Xi+1...Xn) ≠ Ø
             sym = prod[i]
             if sym not in self.variables:
                 continue
             before, after = prod[:i], prod[i + 1:]
             common = cached_firsts[sym] & self.first_body(
                 after, cached_firsts)
             common.discard("&")
             if "&" not in cached_firsts[sym] or len(common) == 0:
                 continue

             offending = prod
             for alternative in self.rules[sym]:
                 if alternative == ("&", ):
                     replacements.add(before + after)
                 else:
                     replacements.add(before + alternative + after)
             break
         if offending is not None:
             break

     # Discarding None is harmless when no conflict was found.
     self.rules[v].discard(offending)
     self.rules[v].update(replacements)
     return offending is not None
Example #12
0
        def expose_indirect_ndet(conflict_terminal):
            """Substitute leading variables of ``v`` until the conflict is direct.

            Every production of ``v`` that starts with a variable and whose
            FIRST contains ``conflict_terminal`` is replaced by its expansions;
            this repeats until a full pass makes no substitution. ``v`` comes
            from the enclosing scope.
            """
            def sub_var(prod):
                head, tail = prod[0], prod[1:]
                expanded = OrderedSet()
                for alternative in self.rules[head]:
                    if alternative != ("&", ):
                        expanded.add(alternative + tail)
                    elif len(tail) == 0:
                        expanded.add(("&", ))
                    else:
                        expanded.add(tail)
                return expanded

            # FIRST sets are computed once, before any substitution.
            cached_first = self.firsts()
            while True:
                changed = False
                rebuilt = OrderedSet()
                for prod in self.rules[v]:
                    starts_with_var = prod[0] in self.variables
                    if starts_with_var and conflict_terminal in self.first_body(
                            prod, cached_first):
                        rebuilt.update(sub_var(prod))
                        changed = True
                    else:
                        rebuilt.add(prod)
                self.rules[v] = rebuilt
                if not changed:
                    break
Example #13
0
    def remove_unreachables(self):  # NOT CONST
        """
        Remove every variable that is not reached from the start symbol,
        together with its rules.

        Reachability is decided by a bfs from self.start over the graph whose
        edges link a variable to every variable that occurs in one of its
        productions. self.terminals is left untouched.

        Pre_conditions: None
        """
        # (A, B) is an edge iff A => alfa and B is in alfa
        # NOTE: doesn't take care of terminals
        edges = {var: OrderedSet() for var in self.variables}
        for head in self.variables:
            for body in self.rules[head]:
                for symbol in body:
                    if symbol in self.variables:
                        edges[head].add(symbol)

        graph = Graph(self.variables, edges)

        # Remove both the variables that were not visited and their rules.
        # The victims are collected first so self.variables is not mutated
        # while it is being iterated.
        visited = graph.bfs(self.start)
        unreachable = [v for v in self.variables if not visited[v]]
        for rem in unreachable:
            del self.rules[rem]
            self.variables.discard(rem)
        self.CHECK_GRAMMAR()
Example #14
0
    def remove_unit(self):  # NOT CONST
        """
        Replace the rules of every variable by the non-unit productions of
        all the variables it reaches through unit productions.

        Post-conditions: may return variable with no production

        Note:
        In this algorithm cyclic productions will be eliminated.
        A -> B                          A ->
        B -> C         -------->        B ->
        C -> A                          C ->

        Our algorithm takes care of cyclic productions
        """
        variables = self.variables

        # (A, B) is an edge iff A => B is a rule
        edges = {var: OrderedSet() for var in variables}
        for head, targets in edges.items():
            for body in self.rules[head]:
                if len(body) == 1 and body[0] in variables:
                    targets.add(body[0])

        graph = Graph(variables, edges)

        # Expand all reacheable unit productions
        # NOTE: Each variable visits itself in bfs
        new_rules = {}
        for var in variables:
            visited = graph.bfs(var)
            collected = OrderedSet()
            for contender in variables:
                if not visited[contender]:
                    continue
                for prod in self.rules[contender]:
                    is_unit = len(prod) <= 1 and prod[0] in variables
                    if not is_unit:
                        collected.add(prod)
            new_rules[var] = collected
        self.rules = new_rules
        self.CHECK_GRAMMAR()
Example #15
0
    def reduce_size(self):  # NOT CONST
        """Break every production longer than two symbols into a chain of pairs.

        A production X1 X2 ... Xn (n > 2) of the k-th variable becomes
        X1 ❬C(k,j)❭, with the new variables chaining the remaining symbols
        two at a time. Shorter productions are kept unchanged.
        """
        # reduce_size does not check if new_v was already in the grammar

        new_rules = dict()
        to_add_var = OrderedSet()

        for number_v, v in enumerate(self.variables):
            next_id = 0
            new_rules[v] = OrderedSet()
            for prod in self.rules[v]:
                if len(prod) <= 2:
                    new_rules[v].add(prod)
                    continue

                # Innermost link: the last two symbols of the production.
                chain_var = "❬C({},{})❭".format(number_v, next_id)
                next_id += 1
                to_add_var.add(chain_var)
                new_rules[chain_var] = OrderedSet([(prod[-2], prod[-1])])

                # Wrap the middle symbols right to left around the chain.
                for symbol in reversed(prod[1:-2]):
                    inner = chain_var
                    chain_var = "❬C({},{})❭".format(number_v, next_id)
                    next_id += 1
                    to_add_var.add(chain_var)
                    new_rules[chain_var] = OrderedSet([(symbol, inner)])

                new_rules[v].add((prod[0], chain_var))

        self.rules = new_rules
        for v in to_add_var:
            self.variables.add(v)
        self.CHECK_GRAMMAR()
Example #16
0
    def has_cycle(self):  # CONST
        """Tell whether some variable can reach itself through unit productions only."""
        variables = self.variables

        # (A, B) is an edge iff A => B is a rule
        edges = {var: OrderedSet() for var in variables}
        for head, targets in edges.items():
            for body in self.rules[head]:
                if len(body) == 1 and body[0] in variables:
                    targets.add(body[0])

        graph = Graph(variables, edges)

        # Check if some V appears in B = > V, where B was visited by bfs(V)
        for var in variables:
            visited = graph.bfs(var)
            back_edge = (var, )
            for contender in variables:
                if not visited[contender]:
                    continue
                if any(prod == back_edge for prod in self.rules[contender]):
                    return True
        return False
Example #17
0
        def power_set(i, cuts):
            """Return the all possible cuts obtained by striking out a subset
               of the nullable variables in the given production.

               Positions from ``i`` onwards are visited; ``cuts`` is extended
               in place with the struck-out variants and the non-empty
               results are returned in a new OrderedSet.
            """
            # cuts[0] is the original production; its length never changes.
            width = len(cuts[0])
            while i != width:
                if cuts[0][i] in nullables:
                    struck = [s[:i] + ("ε", ) + s[i + 1:] for s in cuts]
                    cuts.update(struck)
                i += 1

            # Drop the "ε" placeholders and the cuts that became empty.
            survivors = OrderedSet()
            for cut in cuts:
                kept = tuple(sym for sym in cut if sym != "ε")
                if len(kept) > 0:
                    survivors.add(kept)
            return survivors
Example #18
0
    def first_body(self, body, first=None):  # CONST
        """Calculate the first of a syntactical form.

        Parameters
        ----------
            body: iterable of grammar symbols.
            first: mapping from symbol to its FIRST set; computed with
                self.firsts() when omitted.

        Returns
        -------
            A new OrderedSet. '&' is kept only when every symbol of body has
            '&' in its FIRST set; an empty body yields an empty set.
        """
        if first is None:
            first = self.firsts()

        total = OrderedSet()
        # The loop variable no longer shadows the `body` parameter.
        for symbol in body:
            to_add = first[symbol]
            total.update(to_add)
            if "&" not in to_add:
                # This symbol cannot vanish: nothing after it contributes, and
                # the '&' gathered from the symbols before it no longer applies.
                total.discard("&")
                break
        return total
Example #19
0
    def remove_epsilon(self):  # NOT CONST
        """Remove the '&' productions from the grammar, in place.

        Every production gains the variants obtained by striking out nullable
        symbols, then '&' productions are dropped. When the start symbol is
        nullable, a new start symbol "❬'<start>❭" with productions
        <start> | & is introduced.
        """
        def power_set(i, cuts):
            """Return the all possible cuts obtained by striking out a subset
               of the nullable variables in the given production.
            """
            # cuts[0] is the original production; its length never changes.
            width = len(cuts[0])
            while i != width:
                if cuts[0][i] in nullables:
                    struck = [s[:i] + ("ε", ) + s[i + 1:] for s in cuts]
                    cuts.update(struck)
                i += 1

            # Drop the "ε" placeholders and the cuts that became empty.
            survivors = OrderedSet()
            for cut in cuts:
                kept = tuple(sym for sym in cut if sym != "ε")
                if len(kept) > 0:
                    survivors.add(kept)
            return survivors

        nullables = OrderedSet(["&"])

        # Find nullables through Hopcroft's algorithm
        grew = True
        while grew:
            grew = False
            for var, prods in self.rules.items():
                for prod in prods:
                    if var not in nullables and all(
                            p in nullables for p in prod):
                        nullables.add(var)
                        grew = True

        # Strike out nullables
        for var in self.variables:
            variants = OrderedSet()
            for prod in self.rules[var]:
                variants.update(power_set(0, OrderedSet([prod])))
            self.rules[var].update(variants)
            self.rules[var].discard(("&", ))

        if self.start in nullables:
            new_start = "❬'{}❭".format(self.start)
            self.variables.add(new_start)
            self.rules[new_start] = OrderedSet([(self.start, ), ("&", )])
            self.start = new_start
        self.CHECK_GRAMMAR()
Example #20
0
class ContextFreeGrammar:
    def __init__(self, filename: str):
        """
        Build the grammar by reading and tokenizing the file CFGS_DIR/filename.

        The variable of the first line read becomes the start symbol.

        Exceptions
        --------------
            1. AssertionError when filename does not end in '.cfg'.
            2. Whatever validate_cfg_word raises when the file is an invalid grammar
               according to our context-free grammar specification (see spec.cfg)

        Post-conditions
        ---------------
            1. rules is properly tokenized using the three categories: terminal,
               uppercase-variable, and brackets-variable.
            2. variables is an OrderedSet with len > 0, where each entry is valid var
            3. terminals is an OrderedSet, where each entry is a valid term (lower-case, len = 1)
            4. rules is a dict where each variable has an OrderedSet entry
            5. start is in variables
            6. & is not in term

        Notes
        -----
            The first grammar that is read, spec.cfg, is assumed to be valid; to confirm this,
            you may run the unit tests.
        """

        filepath = os.path.join(CFGS_DIR, filename)
        assert filepath[-4:] == '.cfg', "Invalid extension"

        self.variables = OrderedSet()
        self.terminals = OrderedSet()
        self.rules = dict()
        self.start = None

        with open(filepath, 'r') as f:
            file_read = f.read()
            ContextFreeGrammar.validate_cfg_word(file_read)
            lines = file_read.split("\n")

            for line in lines:
                # NOTE(review): this compares contents, not positions, so every
                # line equal to the last one takes this branch; the file is
                # presumably expected to end with a newline — confirm.
                if line == lines[-1]:
                    assert len(line) == 0
                    continue

                # The first item is the variable being defined.
                items = line.split()
                var = items[0]
                # A variable defined on a second line starts again from an
                # empty set of rules.
                self.rules[var] = OrderedSet()
                self.variables.add(var)

                # First iteration
                if self.start is None:
                    self.start = var

                # Production bodies sit at the even positions 2, 4, ...; the
                # items in between are presumably the '->' and '|' separators.
                k = 2
                while k < len(items):
                    raw = items[k]

                    # NOTE: is not a method since the pre-conds are not worth testing
                    # Tokenize rule
                    tokenized = []
                    i = 0
                    while i < len(raw):
                        c = raw[i]

                        if c == '❬':  # brackets-variable
                            # Scan up to the matching closing bracket.
                            j = i + 1
                            while raw[j] != '❭':
                                j += 1

                            assert len(raw[i:j + 1]) > 3
                            tokenized.append(raw[i:j + 1])
                            i = j + 1
                        else:  # uppercase-variable or terminal
                            tokenized.append(c)
                            if c.isupper():
                                pass
                            else:
                                # '&' stands for epsilon and is never a terminal.
                                if c != "&":
                                    self.terminals.add(c)
                            i += 1
                    self.rules[var].add(tuple(tokenized))
                    k += 2
        self.CHECK_GRAMMAR()

    def __str__(self) -> str:  # CONST
        string = ""

        first = True
        for rule in self.rules[self.start]:
            if first:
                string += "{} -> {} ".format(self.start, "".join(rule))
                first = False
            else:
                string += "| {} ".format("".join(rule))
        string = string[:-1] + "\n"

        for var in self.variables:
            if var == self.start:
                continue
            first = True
            string += "{} -> ".format(var)
            for rule in self.rules[var]:
                if first:
                    string += "{} ".format("".join(rule))
                    first = False
                else:
                    string += "| {} ".format("".join(rule))
            string = string[:-1] + "\n"
        return string

    def save_to_file(self, filename: str):  # CONST
        """Write the textual form of the grammar to CFGS_DIR/filename.

        The file is opened in write mode, so existing contents are replaced.
        """
        target = os.path.join(CFGS_DIR, filename)
        with open(target, 'w') as out:
            out.write(str(self))

    def CHECK_GRAMMAR(self):  # CONST
        """Temporary method for forcing structure into python.

        Raises AssertionError when one of the structural invariants of the
        grammar does not hold. The checks are plain assert statements.
        """
        # Assert post-conditions: 2-6.
        # variables: non-empty OrderedSet of single uppercase letters or
        # names enclosed in ❬ ❭.
        assert type(self.variables) == OrderedSet and len(self.variables) > 0 \
            and all([(v.isupper() and len(v) == 1) or (v[0] == '❬' and v[-1] == '❭' and len(v) > 2) for v in self.variables])
        # terminals: OrderedSet of single characters that are neither '&' nor
        # uppercase.
        assert type(self.terminals) == OrderedSet and all(
            [(t != '&') and (len(t) == 1) and not (t.isupper())
             for t in self.terminals])
        # rules: dict with exactly one OrderedSet entry per variable.
        assert type(self.rules) == dict and self.rules.keys(
        ) == self.variables and all(
            [type(val) == OrderedSet for val in self.rules.values()])
        assert self.start in self.variables

    @staticmethod
    def validate_cfg_word(word: str) -> bool:
        """Check if `word` is a valid .cfg file; the .cfg file format is specified at spec.cfg

        The text is first abstracted into one label per token (spaces are
        dropped) and the labelled word is handed to SPEC_PARSER. Nothing is
        checked when VERIFY_GRAMMAR is false.

        Labels
        ------
            b: |
            e: epsilon
            n: newline
            s: ->
            t: lowercase
            u: uppercase
            o: ❬
            c: ❭

        Exceptions
        ----------
            1. RuntimeError when SPEC_PARSER rejects the labelled word.

        Notes
        -----
            No value is returned explicitly; callers learn about an invalid
            file through the exception.
        """
        if VERIFY_GRAMMAR:
            subst = [('\n', 'n'), ('|', 'b'), ('&', 'e'), ('->', 's'),
                     ('❬', 'o'), ('❭', 'c')]
            word2 = ''
            i = 0
            while i < len(word):
                c = word[i]
                if c.isupper():
                    word2 += 'u'
                # The slice is empty at the end of the word, so a trailing '-'
                # no longer raises IndexError; like any other '-' that is not
                # followed by '>', it falls through to the 't' label.
                elif c == '-' and word[i + 1:i + 2] == '>':
                    word2 += '->'
                    i += 1
                elif c in {'\n', '|', '&', '❬', '❭'}:
                    word2 += c
                elif c == ' ':
                    pass
                else:
                    word2 += 't'
                i += 1
            for k, v in subst:
                word2 = word2.replace(k, v)
            if not SPEC_PARSER.parse(word2):
                raise RuntimeError("This Grammar is not a valid .cfg file")

    def has_e(self):  # CONST
        """Tests if the grammar has &-rules that are not in the start symbol."""
        for v in self.variables:
            for prod in self.rules[v]:
                if v != self.start and prod == ("&", ):
                    return True
        return False

    def has_cycle(self):  # CONST
        """Tell whether some variable can reach itself through unit productions only."""
        variables = self.variables

        # (A, B) is an edge iff A => B is a rule
        edges = {var: OrderedSet() for var in variables}
        for head, targets in edges.items():
            for body in self.rules[head]:
                if len(body) == 1 and body[0] in variables:
                    targets.add(body[0])

        graph = Graph(variables, edges)

        # Check if some V appears in B = > V, where B was visited by bfs(V)
        for var in variables:
            visited = graph.bfs(var)
            back_edge = (var, )
            for contender in variables:
                if not visited[contender]:
                    continue
                if any(prod == back_edge for prod in self.rules[contender]):
                    return True
        return False

    def remove_left_recursion(self):  # NOT CONST
        """
        Rewrite the grammar in place so that no variable is left recursive.

        Variables are processed in the order of self.variables: productions of
        the i-th variable that start with an earlier variable are expanded,
        then direct recursion on the i-th variable is moved to a new variable
        named "❬<var>'❭" that also receives an '&' production.

        Exceptions
        --------------
            1. self is not e-free.
            2. self is not cycle-free.

        Notes
        -----
            Be aware when Aj has no productions.
            While removing direct recursion, one may end up with len(prods_i) == 0, since
            there may be S => Sbeta, where beta has no productions.

        """
        if self.has_e():
            raise RuntimeError(
                "A grammar must be &-free in order to remove left recursions.")

        if self.has_cycle():
            raise RuntimeError(
                "A grammar must be cycle-free in order to remove left recursions."
            )

        # Remove indirect
        # The range is computed once, so variables appended below are not
        # visited by this loop.
        for i in range(len(self.variables)):
            direct = False
            for j in range(i):
                to_remove = set()
                # NOTE(review): the set being iterated is also added to inside
                # the loop; this depends on how OrderedSet iterates while it
                # grows — confirm before changing the container type.
                for production in self.rules[self.variables[i]]:
                    if self.variables[j] == production[0]:
                        # Ai -> Aj alpha becomes Ai -> beta alpha for every Aj -> beta
                        alpha = production[1:]
                        to_remove.add(production)
                        for beta in self.rules[self.variables[j]]:
                            self.rules[self.variables[i]].add(beta + alpha)

                for rem in to_remove:
                    self.rules[self.variables[i]].discard(rem)

            # Check whether Ai still has a production of the form Ai -> Ai alpha
            for production in self.rules[self.variables[i]]:
                if production[0] == self.variables[i]:
                    direct = True

            if direct:
                new_var = "❬{}'❭".format(self.variables[i])
                self.variables.add(new_var)
                self.rules[new_var] = OrderedSet()
                new_prods_i = OrderedSet()
                for production in self.rules[self.variables[i]]:
                    if production[0] == self.variables[i]:
                        # Ai -> Ai alpha  becomes  Ai' -> alpha Ai'
                        self.rules[new_var].add(production[1:] + (new_var, ))
                    else:
                        # Ai -> beta  becomes  Ai -> beta Ai'
                        new_prods_i.add(production + (new_var, ))

                self.rules[self.variables[i]] = new_prods_i
                self.rules[new_var].add(('&', ))
        self.CHECK_GRAMMAR()

    def remove_unit(self):  # NOT CONST
        """
        Replace the rules of every variable by the non-unit productions of
        all the variables it reaches through unit productions.

        Post-conditions: may return variable with no production

        Note:
        In this algorithm cyclic productions will be eliminated.
        A -> B                          A ->
        B -> C         -------->        B ->
        C -> A                          C ->

        Our algorithm takes care of cyclic productions
        """
        variables = self.variables

        # (A, B) is an edge iff A => B is a rule
        edges = {var: OrderedSet() for var in variables}
        for head, targets in edges.items():
            for body in self.rules[head]:
                if len(body) == 1 and body[0] in variables:
                    targets.add(body[0])

        graph = Graph(variables, edges)

        # Expand all reacheable unit productions
        # NOTE: Each variable visits itself in bfs
        new_rules = {}
        for var in variables:
            visited = graph.bfs(var)
            collected = OrderedSet()
            for contender in variables:
                if not visited[contender]:
                    continue
                for prod in self.rules[contender]:
                    is_unit = len(prod) <= 1 and prod[0] in variables
                    if not is_unit:
                        collected.add(prod)
            new_rules[var] = collected
        self.rules = new_rules
        self.CHECK_GRAMMAR()

    def remove_epsilon(self):  # NOT CONST
        """Remove the '&' productions from the grammar, in place.

        Every production gains the variants obtained by striking out nullable
        symbols, then '&' productions are dropped. When the start symbol is
        nullable, a new start symbol "❬'<start>❭" with productions
        <start> | & is introduced.
        """
        def power_set(i, cuts):
            """Return the all possible cuts obtained by striking out a subset
               of the nullable variables in the given production.
            """
            # cuts[0] is the original production; its length never changes.
            width = len(cuts[0])
            while i != width:
                if cuts[0][i] in nullables:
                    struck = [s[:i] + ("ε", ) + s[i + 1:] for s in cuts]
                    cuts.update(struck)
                i += 1

            # Drop the "ε" placeholders and the cuts that became empty.
            survivors = OrderedSet()
            for cut in cuts:
                kept = tuple(sym for sym in cut if sym != "ε")
                if len(kept) > 0:
                    survivors.add(kept)
            return survivors

        nullables = OrderedSet(["&"])

        # Find nullables through Hopcroft's algorithm
        grew = True
        while grew:
            grew = False
            for var, prods in self.rules.items():
                for prod in prods:
                    if var not in nullables and all(
                            p in nullables for p in prod):
                        nullables.add(var)
                        grew = True

        # Strike out nullables
        for var in self.variables:
            variants = OrderedSet()
            for prod in self.rules[var]:
                variants.update(power_set(0, OrderedSet([prod])))
            self.rules[var].update(variants)
            self.rules[var].discard(("&", ))

        if self.start in nullables:
            new_start = "❬'{}❭".format(self.start)
            self.variables.add(new_start)
            self.rules[new_start] = OrderedSet([(self.start, ), ("&", )])
            self.start = new_start
        self.CHECK_GRAMMAR()

    def remove_unproductives(self):  # NOT CONST
        """Remove, in place, the productions and variables that cannot derive a word.

        A symbol is productive when it is a terminal, "&", or a variable
        owning a production made only of productive symbols.

        Notes
        -----
            If the grammar generates no words (empty language), then the grammar returned will be
                start_symbol -> start_symbol
            and it will keep its terminals.
        """
        productives = OrderedSet()
        for t in self.terminals:
            productives.add(t)
        productives.add("&")

        # Grow the productive set until a whole pass adds nothing
        changed = True
        while changed:
            changed = False
            for var, prods in self.rules.items():
                if var in productives:
                    continue
                if any(all(p in productives for p in prod) for prod in prods):
                    productives.add(var)
                    changed = True

        # Keep only the productions made of productive symbols
        new_rules = dict()
        for v in self.variables:
            new_production = OrderedSet()
            for production in self.rules[v]:
                if all(p in productives for p in production):
                    new_production.add(production)
            new_rules[v] = new_production

        self.rules = new_rules

        # Materialise the list first: self.variables is mutated below
        to_remove = [v for v in self.variables if len(self.rules[v]) == 0]
        for rem in to_remove:
            del self.rules[rem]
            self.variables.discard(rem)

        # if empty language S -> S
        if self.start not in self.rules:
            self.variables = OrderedSet()
            self.rules = dict()
            self.variables.add(self.start)
            self.rules[self.start] = OrderedSet()
            # A production is a tuple of symbols; adding the bare string would
            # make a multi-character start symbol be read symbol by symbol.
            self.rules[self.start].add((self.start, ))
        self.CHECK_GRAMMAR()

    def remove_unreachables(self):  # NOT CONST
        """Remove, in place, the variables that the start symbol cannot reach.

        A variable B is reachable from A when B occurs in the body of some
        production of A, directly or through other variables. Terminals are
        left untouched.

        Pre_conditions: None
        """
        # (A, B) is an edge iff A => alfa and B is in alfa
        # NOTE: doesn't take care of terminals
        edges = {var: OrderedSet() for var in self.variables}
        for head in self.variables:
            for body in self.rules[head]:
                for symbol in body:
                    if symbol in self.variables:
                        edges[head].add(symbol)

        graph = Graph(self.variables, edges)

        # Remove both the variables that were not visited and their rules.
        # The list is materialised first because self.variables is mutated.
        visited = graph.bfs(self.start)
        unreachable = [v for v in self.variables if not visited[v]]
        for rem in unreachable:
            del self.rules[rem]
            self.variables.discard(rem)
        self.CHECK_GRAMMAR()

    def replace_terminals(self):  # NOT CONST
        """Replace, in place, the terminals of every production with two or
        more symbols by a variable whose only production is that terminal.

        A variable that already has the terminal as its single production is
        reused; otherwise a new variable named ❬R1❭, ❬R2❭, ... is created.

        Pre_conditions: None
        """
        new_v_id = 0
        to_add_var = OrderedSet()
        new_rules = dict()

        # Looks first among the grammar's variables, then among the ones
        # created during this call; does not check for name clashes
        def var_to_terminal(sym):
            for pool in (self.variables, to_add_var):
                for candidate in pool:
                    bodies = self.rules[candidate]
                    if len(bodies) == 1 and bodies[0] == (sym, ):
                        return candidate
            return None

        # Registers a brand new variable producing only `sym`
        def create_new_v(sym):
            nonlocal new_v_id
            new_v_id += 1
            fresh = "❬R{}❭".format(new_v_id)
            to_add_var.add(fresh)
            # Stored in the old rules as well so var_to_terminal can find it
            self.rules[fresh] = OrderedSet([(sym, )])
            new_rules[fresh] = OrderedSet([(sym, )])
            return fresh

        for v in self.variables:
            rewritten = OrderedSet()
            for prod in self.rules[v]:
                symbols = list(prod)
                if len(symbols) >= 2:
                    for i, symbol in enumerate(symbols):
                        if symbol not in self.terminals:
                            continue
                        stand_in = var_to_terminal(symbol)
                        if stand_in is None:
                            stand_in = create_new_v(symbol)
                        # Swap this occurrence and every later one
                        symbols[i:] = [
                            stand_in if s == symbol else s
                            for s in symbols[i:]
                        ]
                rewritten.add(tuple(symbols))
            new_rules[v] = rewritten

        self.rules = new_rules
        for created in to_add_var:
            self.variables.add(created)
        self.CHECK_GRAMMAR()

    def reduce_size(self):  # NOT CONST
        """Break, in place, every production longer than two symbols into a
        chain of two-symbol productions.

        The variables created are named ❬C(n,k)❭, where n is the position of
        the owning variable in self.variables and k counts the variables
        created for that owner.

        NOTE: there is no check that a created name was already in the grammar.
        """
        new_rules = dict()
        to_add_var = OrderedSet()

        for number_v, v in enumerate(self.variables):
            head_bodies = new_rules[v] = OrderedSet()
            created = 0
            for prod in self.rules[v]:
                if len(prod) <= 2:
                    head_bodies.add(prod)
                    continue

                # Fold from the right: X1 X2 ... Xn becomes
                #   C0 -> Xn-1 Xn, C1 -> Xn-2 C0, ..., head -> X1 Ck
                tail = prod[-1]
                for symbol in reversed(prod[1:-1]):
                    link = "❬C({},{})❭".format(number_v, created)
                    created += 1
                    to_add_var.add(link)
                    new_rules[link] = OrderedSet([(symbol, tail)])
                    tail = link
                head_bodies.add((prod[0], tail))

        self.rules = new_rules
        for link in to_add_var:
            self.variables.add(link)
        self.CHECK_GRAMMAR()

    def convert_to_cnf(self):  # NOT CONST
        """Apply, in this order, the in-place passes used for the CNF conversion."""
        passes = (
            "remove_epsilon",
            "remove_unit",
            "remove_unproductives",
            "remove_unreachables",
            "replace_terminals",
            "reduce_size",
        )
        for name in passes:
            # Looked up one at a time, right before being called
            getattr(self, name)()

    def has_left_recursion(self):  # CONST
        """Tell whether some variable lies on a loop of the graph that links
        each variable to the variables leading its productions."""
        # (A, B) is an edge iff A => B𝛼 is a rule in rules
        edges = {}
        for head in self.variables:
            leading = OrderedSet()
            for body in self.rules[head]:
                first_symbol = body[0]
                if first_symbol in self.variables:
                    leading.add(first_symbol)
            edges[head] = leading

        graph = Graph(self.variables, edges)
        # Every variable is queried before the answers are combined
        loops = [graph.has_loop(var) for var in self.variables]
        return any(loops)

    def firsts(self):  # CONST
        """Compute FIRST for "&", every terminal and every variable.

        Returns
        -------
            dict mapping each symbol to an OrderedSet with the terminals (and
            "&", when the symbol can vanish) that may start what it derives.

        Exceptions
        --------------
            1. The grammar has left recursion.
        """
        def first_var(v):
            """Calculate the first of a variable using Dynamic Programming."""
            # Terminals, "&" and variables already entered are answered from
            # the table. Marking on entry also makes a re-entrant call
            # (a variable reached again behind a nullable prefix) return the
            # partial set instead of recursing forever.
            if v in visited:
                return first[v]
            visited.add(v)

            for prod in self.rules[v]:
                # FIRST of this production alone. It is built apart so that
                # dropping "&" here cannot erase the "&" that another
                # production of v (e.g. v -> &) has contributed.
                body_first = OrderedSet()
                for p in prod:
                    to_add = first_var(p)
                    body_first.update(to_add)
                    # Stop when Xi doesn't have epsilon in its first
                    if "&" not in to_add:
                        body_first.discard("&")
                        break
                first[v].update(body_first)
            return first[v]

        if self.has_left_recursion():
            raise RuntimeError(
                "A grammar can't contain left recursion in ordered to calculate FIRSTS."
            )

        first = {
            t: OrderedSet([t])
            for t in OrderedSet(['&']) | self.terminals
        }
        # "&" and the terminals are their own FIRST: nothing to compute
        visited = set(first)
        first.update({v: OrderedSet() for v in self.variables})

        for v in self.variables:
            first_var(v)
        return first

    def first_body(self, body, first=None):  # CONST
        """Calculate the first of a syntactical form.

        Parameters
        ----------
            body: sequence of symbols (terminals, variables or "&")
            first: table as returned by firsts(); computed when omitted

        Returns
        -------
            OrderedSet with the union of the FIRST of the leading symbols of
            body, up to and including the first symbol whose FIRST lacks "&".
            "&" is kept only when every symbol of body has it.
        """
        if first is None:
            first = self.firsts()

        total = OrderedSet()
        for symbol in body:
            to_add = first[symbol]
            total.update(to_add)
            # Nothing after a non-nullable symbol can start the form
            if "&" not in to_add:
                total.discard("&")
                break
        return total

    def follows(self):  # CONST
        """Compute FOLLOW for every variable.

        Returns a dict mapping each variable to an OrderedSet; "$" is placed
        in the FOLLOW of the start symbol. The rules are swept repeatedly
        until a whole sweep adds nothing.
        """
        first = self.firsts()
        follow = {var: OrderedSet() for var in self.variables}
        follow[self.start].add("$")

        grew = True
        while grew:
            grew = False
            for head, bodies in self.rules.items():
                for body in bodies:
                    # Add FIRSTS: what can start the rest of the body
                    # follows each variable that is not in the last position
                    for idx, symbol in enumerate(body[:-1]):
                        if symbol not in self.variables:
                            continue
                        trailing = self.first_body(body[idx + 1:], first)
                        trailing.discard("&")
                        if not (trailing <= follow[symbol]):
                            grew = True
                            follow[symbol].update(trailing)

                    # Add FOLLOWS: walking from the right, FOLLOW(head) goes
                    # to each variable while the symbols crossed are nullable
                    inherited = follow[head]
                    inherited.discard("&")
                    for symbol in reversed(body):
                        if symbol not in self.variables:
                            break
                        if not inherited.issubset(follow[symbol]):
                            grew = True
                            follow[symbol].update(inherited)
                        if "&" not in first[symbol]:
                            break
        return follow

    def make_LL1_table(self) -> dict:  # CONST
        """Compute LL(1) parsing table.

        Returns
        -------
            dict mapping (variable, lookahead symbol) to the production
            (as stored in self.rules) to be applied.

        Exceptions
        ----------
            1. The grammar has left recursion (raised by firsts()).
            2. The grammar has Fi/Fi or Fi/Fo conflict.
        """
        firsts = self.firsts()
        follows = self.follows()
        table = dict()

        for v in self.variables:
            for alpha in self.rules[v]:
                first_alpha = self.first_body(alpha, firsts)
                # If alpha = &, then skip the loop
                for f in first_alpha - {'&'}:
                    if (v, f) in table:
                        raise RuntimeError("First/First conflict at {}".format(
                            (v, f)))
                    table[(v, f)] = alpha
                # alpha can vanish: it is chosen on whatever may follow v
                if "&" in first_alpha:
                    for f in follows[v]:
                        if (v, f) in table:
                            # The pair goes in as one tuple, as in the message
                            # above; passing v and f separately dropped f.
                            raise RuntimeError(
                                "First/Follow conflict at {}".format((v, f)))
                        table[(v, f)] = alpha
        return table

    def make_LL1_parser(self) -> PredictiveParser:  # CONST
        """Return a PredictiveParser built from this grammar's LL(1) table."""
        table = self.make_LL1_table()
        return PredictiveParser(table)

    def left_factoring(self) -> bool:  # NOT CONST
        """
        Try to remove the non-determinism of the grammar by left factoring.

        The rules are rewritten in place. At most 20 passes are made over
        the variables; a pass is abandoned at the first variable for which a
        First/First conflict was found and factored.

        Returns
        -------
            True: left factoring succeeded (a whole pass found no conflict)
            False: a conflict was still found in each of the 20 passes

        Exceptions
        --------------
            1. The grammar has left recursion.
        """

        if self.has_left_recursion():
            raise RuntimeError(
                "A grammar can't have left recursion in ordered to be factored"
            )

        # Counter used to name the variables created by create_new_var_lcp
        new_var_id = 0

        # NOTE: the helpers below read `v` (and create_new_var_lcp also reads
        # `conflict_terminal`) from the main loop at the end of this method.

        def expose_indirect_ndet(conflict_terminal):
            """Replace the leading variable of the productions of v whose
            FIRST holds conflict_terminal by that variable's productions,
            repeating until no production of v qualifies."""
            def sub_var(prod):
                """Return prod with its leading variable substituted by each
                of that variable's productions."""
                to_add = OrderedSet()
                for prod_var in self.rules[prod[0]]:
                    if prod_var == ("&", ):
                        # The variable vanishes: keep the rest, or "&" when
                        # nothing is left
                        if len(prod[1:]) == 0:
                            to_add.add(("&", ))
                        else:
                            to_add.add(prod[1:])
                    else:
                        to_add.add(prod_var + prod[1:])
                return to_add

            # Computed once, before any substitution is made
            cached_first = self.firsts()
            substitution_happened = True
            while substitution_happened:
                substitution_happened = False
                new_rules_v = OrderedSet()
                for prod in self.rules[v]:
                    if prod[0] in self.variables and conflict_terminal in self.first_body(
                            prod, cached_first):
                        new_rules_v.update(sub_var(prod))
                        substitution_happened = True
                    else:
                        new_rules_v.add(prod)
                self.rules[v] = new_rules_v

        def create_new_var_lcp(lcp):
            """Factor lcp out of the productions of v that start with
            conflict_terminal, moving what remains of them to a new
            variable."""
            nonlocal new_var_id
            new_rules_old_v = OrderedSet()
            new_var_id = new_var_id + 1
            # The new name uses v itself when it is a single character,
            # otherwise the second character of v
            if len(v) == 1:
                new_var = "❬{},{}❭".format(v, new_var_id)
            else:
                new_var = "❬{},{}❭".format(v[1], new_var_id)
            self.rules[new_var] = OrderedSet()
            self.variables.add(new_var)

            # Factor
            ll = len(lcp)
            if lcp == ("&", ):
                new_rules_old_v.add((new_var, ))
            else:
                new_rules_old_v.add(lcp + (new_var, ))

            # Add prods to factored variable
            for prod in self.rules[v]:
                if prod[0] == conflict_terminal:
                    # What follows the prefix goes to the new variable;
                    # "&" stands for an empty remainder
                    if len(prod[ll:]) != 0:
                        self.rules[new_var].add(prod[ll:])
                    else:
                        self.rules[new_var].add(("&", ))
                else:
                    new_rules_old_v.add(prod)

            self.rules[v] = new_rules_old_v

        def first_follow():
            """Expand the first variable found with a Fi/Fo conflict inside
            a production of v. Return True when a production was expanded."""
            new_rules_old_v = OrderedSet()
            to_discard = None
            cached_firsts = self.firsts()
            for prod in self.rules[v]:
                lp = len(prod)
                # The last symbol is never tested: nothing comes after it
                for i in range(lp - 1):
                    # Test if any variable Xi appearing in X1X2...Xn has a Fi/Fo conflict
                    # That is, ε in FIRST(Xi) AND FIRST(Xi) ∩ FIRST(Xi+1...Xn) ≠ Ø
                    if prod[i] in self.variables:
                        intersection = cached_firsts[
                            prod[i]] & self.first_body(prod[i + 1:],
                                                       cached_firsts)
                        intersection.discard("&")
                        if "&" in cached_firsts[
                                prod[i]] and len(intersection) != 0:
                            to_discard = prod
                            # One copy of prod per production of Xi, with Xi
                            # replaced by it (or erased when it is "&")
                            for prod_sub in self.rules[prod[i]]:
                                if prod_sub == ("&", ):
                                    new_rules_old_v.add(prod[:i] +
                                                        prod[i + 1:])
                                else:
                                    new_rules_old_v.add(prod[:i] + prod_sub +
                                                        prod[i + 1:])
                            break
                # Only one production is expanded per call
                if not to_discard is None:
                    break
            # When nothing was found to_discard is None and new_rules_old_v
            # is empty
            self.rules[v].discard(to_discard)
            self.rules[v].update(new_rules_old_v)
            if not to_discard is None:
                return True
            return False

        def first_first():
            """Look for a symbol shared by the FIRST of two productions of v
            and, when there is one, factor it. Return True when a conflict
            was found."""
            nonlocal conflict_terminal
            cached_first = self.firsts()
            # Search for non determinism
            total = OrderedSet()
            for prod in self.rules[v]:
                for ter in self.first_body(prod, cached_first):
                    # Only the first repeated symbol is recorded
                    if ter in total and conflict_terminal is None:
                        conflict_terminal = ter
                        break
                    total.add(ter)

            if conflict_terminal:
                # NOTE: this binds a name local to first_first (only
                # conflict_terminal is declared nonlocal); the flag of the
                # main loop is set by the caller
                has_non_determinism = True
                # print("\n\nGRAMMAR Before")
                # print(self)

                # Expose indirect
                expose_indirect_ndet(conflict_terminal)
                # print("\n\nGRAMMAR After Substitutions")
                # print(self)

                # Direct
                # Longest common prefix of the productions that start with
                # the conflicting symbol
                lcp = os.path.commonprefix([
                    prod for prod in self.rules[v]
                    if prod[0] == conflict_terminal
                ])
                create_new_var_lcp(lcp)
                # print("\n\nGRAMMAR After Eliminating Direct")
                # print(self)
                return True
            return False

        # At most 20 passes are made before giving up
        for i in range(20):
            has_non_determinism = False

            for v in self.variables:
                conflict_terminal = None

                # Fi/Fo conflict always introduces Fi/Fi conflict
                # NOTE: the value returned is not used afterwards
                fi_fo = first_follow()

                # Start another iteration if non-determinism remains
                fi_fi = first_first()
                if fi_fi:
                    has_non_determinism = True
                    break

            if not has_non_determinism:
                # print("Finished in {} step(s)".format(i))
                self.CHECK_GRAMMAR()
                return True
        self.CHECK_GRAMMAR()
        return False
Example #21
0
    def remove_unproductives(self):  # NOT CONST
        """Remove, in place, the productions and variables that cannot derive a word.

        A symbol is productive when it is a terminal, "&", or a variable
        owning a production made only of productive symbols.

        Notes
        -----
            If the grammar generates no words (empty language), then the grammar returned will be
                start_symbol -> start_symbol
            and it will keep its terminals.
        """
        productives = OrderedSet()
        for t in self.terminals:
            productives.add(t)
        productives.add("&")

        # Grow the productive set until a whole pass adds nothing
        changed = True
        while changed:
            changed = False
            for var, prods in self.rules.items():
                if var in productives:
                    continue
                if any(all(p in productives for p in prod) for prod in prods):
                    productives.add(var)
                    changed = True

        # Keep only the productions made of productive symbols
        new_rules = dict()
        for v in self.variables:
            new_production = OrderedSet()
            for production in self.rules[v]:
                if all(p in productives for p in production):
                    new_production.add(production)
            new_rules[v] = new_production

        self.rules = new_rules

        # Materialise the list first: self.variables is mutated below
        to_remove = [v for v in self.variables if len(self.rules[v]) == 0]
        for rem in to_remove:
            del self.rules[rem]
            self.variables.discard(rem)

        # if empty language S -> S
        if self.start not in self.rules:
            self.variables = OrderedSet()
            self.rules = dict()
            self.variables.add(self.start)
            self.rules[self.start] = OrderedSet()
            # A production is a tuple of symbols; adding the bare string would
            # make a multi-character start symbol be read symbol by symbol.
            self.rules[self.start].add((self.start, ))
        self.CHECK_GRAMMAR()
Example #22
0
    def __init__(self, filename: str):
        """
        Load the grammar stored in the file `filename` inside CFGS_DIR.

        Each line is split on whitespace: the first item is the variable
        being defined and the items at positions 2, 4, 6, ... are its
        productions. The first variable read becomes the start symbol.

        Failures
        --------
            1. AssertionError("Invalid extension") when the path does not end in '.cfg'
            2. Whatever ContextFreeGrammar.validate_cfg_word raises for the
               file contents (presumably an invalid grammar; see spec.cfg)
            3. AssertionError when the last line of the file is not empty

        Post-conditions
        ---------------
            1. rules is a dict where each variable read has an OrderedSet of
               productions, each production being a tuple of symbols
            2. a symbol is either a whole ❬...❭ group or a single character
            3. terminals holds every single-character symbol that is neither
               upper-case nor "&"
            4. start is the variable of the first line read
        """

        def tokenize(raw):
            """Split one production into symbols, recording its terminals."""
            symbols = []
            i = 0
            while i < len(raw):
                c = raw[i]
                if c != '❬':  # uppercase-variable or terminal
                    symbols.append(c)
                    if not c.isupper() and c != "&":
                        self.terminals.add(c)
                    i += 1
                    continue

                # brackets-variable: runs up to the closing bracket
                j = i + 1
                while raw[j] != '❭':
                    j += 1
                token = raw[i:j + 1]
                assert len(token) > 3
                symbols.append(token)
                i = j + 1
            return tuple(symbols)

        filepath = os.path.join(CFGS_DIR, filename)
        assert filepath[-4:] == '.cfg', "Invalid extension"

        self.variables = OrderedSet()
        self.terminals = OrderedSet()
        self.rules = dict()
        self.start = None

        with open(filepath, 'r') as f:
            file_read = f.read()
        ContextFreeGrammar.validate_cfg_word(file_read)
        lines = file_read.split("\n")
        last_line = lines[-1]

        for line in lines:
            # The file must end with an empty line; lines equal to it are skipped
            if line == last_line:
                assert len(line) == 0
                continue

            items = line.split()
            var = items[0]
            bodies = self.rules[var] = OrderedSet()
            self.variables.add(var)

            # First iteration
            if self.start is None:
                self.start = var

            # items[1] and the other odd positions are separators
            for raw in items[2::2]:
                bodies.add(tokenize(raw))
        self.CHECK_GRAMMAR()