def __init__(self, grammar_file):
    '''
    GrammarEngine.grammar returns a Grammar object;
    to get the underlying dictionary, use GrammarEngine.grammar.grammar.
    '''
    self.grammar = GrammarEngine(grammar_file).grammar.grammar
    self.verbose = False
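A minimal sketch of the attribute chain described in the docstring above; the grammar file name is a placeholder, and the .rules attribute on each value follows its usage elsewhere in these examples:

engine = GrammarEngine("sample_grammar.txt")   # hypothetical grammar file name
rules_dict = engine.grammar.grammar            # plain dict keyed by nonterminal name
for name, nonterminal in rules_dict.items():
    print(name, nonterminal.rules)             # each nonterminal symbol exposes a .rules list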
Example #2
    def __init__(self, grammar):
        self.string = ""
        self.words = []
        self.engine = GrammarEngine(grammar)
        self.RDP = RecursiveDescentParser(self.engine, False)

        self.nonterminals = self.engine.get_nonterminals(grammar)
        self.terminals = self.engine.get_terminals(grammar)
Example #3
def component4():
    grammar_engine = GrammarEngine("component4.txt")
    grammar = grammar_engine.grammar
    generated_texts = []

    for i in range(1):
        text = grammar_engine.generate("origin")
        generated_texts.append(text)

    final_sentence = " ".join(generated_texts)
    print(final_sentence)
    parser = IslandParser(grammar)
    result = parser.parse(final_sentence)
    # each element is expected to look like "(symbol token)"; print it as "symbol:token"
    for element in result:
        substrings = element.split()
        print(substrings[0][1:] + ":" + substrings[1][:-1])
def change_verb(doc):
  '''
  Replace the verb in doc with a randomly chosen alternative from the verb grammar
  '''
  verb = find_verb(doc)
  grammar = GrammarEngine("verb.txt").grammar
  # all production rules for this verb's nonterminal
  rule = grammar.find(verb).rules
  index = random.randint(0, len(rule)-1)
  # slice the rule's string representation down to the bare replacement verb
  new_verb = str(rule[index])[7:-3]
  new_sentence = ""
  for word in doc:
    if word.text == verb:
      new_sentence = new_sentence + " " + new_verb
    elif len(new_sentence) > 0:
      new_sentence = new_sentence + " " + word.text
    else:
      new_sentence = word.text
  return new_sentence
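A hedged usage sketch for change_verb; it assumes spaCy is installed and that random, find_verb, and verb.txt are available in the module that defines change_verb:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Buster chased the squirrel")
print(change_verb(doc))  # e.g. "Buster dodged the squirrel", depending on verb.txt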
Example #5
def component2():
    grammar_engine = GrammarEngine("component2.txt")
    grammar = grammar_engine.grammar
    parser = RecursiveDescentParser(grammar, False)
    result = parser.parse("Joe said Buster ghosted Schiller", "S")

    print(result)
Example #6
        while length >= 1:
            start = 0
            while start <= len(fragments) - length:
                # get the substring
                substring = " ".join(fragments[start:start + length])
                # parse the substring
                # try out all symbols in the grammar
                for symbol in self.grammar.grammar.keys():
                    result = self.parser.parse(string=substring,
                                               start_symbol_name=symbol)
                    parse_already_exists = False
                    # a non-None result means the substring parsed successfully
                    if result is not None:
                        # check if the result is a subset of some other partial parse
                        for parse in self.partial_parses:
                            if result in parse:
                                parse_already_exists = True
                        # if it's not a subset of any of the existing parses, add it to the list
                        if not parse_already_exists:
                            self.partial_parses.append(result)
                # move the window
                start += 1
            # decrease the length of the expected substring
            length -= 1
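# Standalone illustration (not part of the method above) of the order in which
# the nested loops visit substrings: longest windows first, each slid left to
# right, then the window length shrinks by one.
fragments = ["a", "b", "c"]
length = len(fragments)
while length >= 1:
    start = 0
    while start <= len(fragments) - length:
        print(" ".join(fragments[start:start + length]))
        start += 1
    length -= 1
# prints: "a b c", "a b", "b c", "a", "b", "c"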


if __name__ == "__main__":
    grammar = GrammarEngine("component4.txt").grammar
    parser = IslandParser(grammar=grammar, verbose=False)
    string = "Hello, I am Yemi, a senior CS  major. I live in Cassat. A usual day looks like this for me: I wake up at 11  and go to work  after having ramen  for breakfast. After that, I drop my stuff in Cassat, have lunch in LDC, and then work in Little Joy."
    print(parser.parse(string=string))
Example #7
class IslandParser:
    def __init__(self, grammar):
        self.string = ""
        self.words = []
        self.engine = GrammarEngine(grammar)
        self.RDP = RecursiveDescentParser(self.engine, False)

        self.nonterminals = self.engine.get_nonterminals(grammar)
        self.terminals = self.engine.get_terminals(grammar)

    '''
    Can be given a list of nonterminals (if not, uses default).
    Returns a tuple of island parses, each of which is a list of partial parses of the object's words.

    Starting at the number of tokens, n, in the string it wants to parse and successively decreasing by 1, looks at all the n-length substrings of the string. Attempts to parse each one. If able, it adds it to a partial parse, then continues on.
    '''

    def parse(self, string):
        # self.string = string + " "
        # string = string + " ENDS"
        puncts = [",", ".", "!"]
        # words = string.split(" ")
        self.words = re.findall(r"[\w']+|[.,!?;]", string)

        #False until it finds the largest thing it can parse
        biggest_parse = False
        #the partial parses - ends up being a list of lists. Each inner list is one partial parse
        partial_parses = []
        #the tokens that make up each partial parse - a list of lists
        partial_parses_tokens = []
        #the indices of the tokens that make up each partial parse - a list of lists
        partial_parses_indices = []

        symbols = self.nonterminals
        for i in range(len(self.words), 1, -1):
            # print(self.words)
            token_lists = self.substringsFilterNotInGrammar(self.words, i)
            # print(token_lists)
            biggest_parse_this_level = False
            for token_and_indices in token_lists:
                token = token_and_indices[0]
                # print("i", i)
                # print(token)
                indices = token_and_indices[1]
                for symbol in symbols:
                    # print(symbol)
                    parse = self.RDP.parse(token, symbol)
                    # print(parse)
                    if parse is not None:
                        if not biggest_parse:
                            temp = [parse]
                            partial_parses.append(temp)
                            # partial_parses_tokens.append(token.split(" "))
                            partial_parses_tokens.append(
                                re.findall(r"[\w']+|[.,!?;]", token))
                            partial_parses_indices.append(indices)
                            biggest_parse_this_level = True
                        else:
                            # little_tokens = token.split(" ")
                            little_tokens = re.findall(r"[\w']+|[.,!?;]",
                                                       token)
                            parse_num = 0
                            for par in partial_parses_tokens:
                                new_tokens = True
                                for partial in par:
                                    for index in indices:
                                        if index in partial_parses_indices[
                                                parse_num]:
                                            new_tokens = False
                                if new_tokens:
                                    # new_tokens_list = token.split(" ")
                                    new_tokens_list = re.findall(
                                        r"[\w']+|[.,!?;]", token)
                                    temp_parses = partial_parses[
                                        parse_num].copy()
                                    temp_parses.append(parse)
                                    temp_parses_tokens = partial_parses_tokens[
                                        parse_num].copy()
                                    temp_parses_indices = partial_parses_indices[
                                        parse_num].copy()
                                    for x in new_tokens_list:
                                        temp_parses_tokens.append(x)
                                    for index in indices:
                                        temp_parses_indices.append(index)
                                    partial_parses.append(temp_parses)
                                    partial_parses_tokens.append(
                                        temp_parses_tokens)
                                    partial_parses_indices.append(
                                        temp_parses_indices)
                                parse_num = parse_num + 1

                #if all tokens have been parsed
                all_tokens_parses = []
                x = 0
                all_parsed = False
                for tokensList in partial_parses_tokens:
                    if len(tokensList) == len(self.words):
                        all_tokens_parses.append(partial_parses[x])
                        all_parsed = True
                    x += 1
                if all_parsed:
                    final_parses = []
                    x = 0
                    minParses = min(len(parse) for parse in all_tokens_parses)
                    for parse in all_tokens_parses:
                        if len(parse) == minParses:
                            final_parses.append(parse)
                        x += 1
                    # print("Done early")
                    if len(final_parses) > 1:
                        largest_length = -float('inf')
                        largest_parse = []
                        for final_parse in final_parses:
                            if final_parse[0].count("(") > largest_length:
                                largest_length = final_parse[0].count("(")
                        for final_parse in final_parses:
                            if final_parse[0].count("(") == largest_length:
                                largest_parse.append(final_parse)
                        return tuple(largest_parse), True
                    return final_parses, True

                if biggest_parse_this_level:
                    biggest_parse = True

        if partial_parses == []:
            # print("There are no possible partial parses for the given string and grammar. Please ensure that there is white space around each token")
            return (), False
        #do something to choose which partial parses to return
        #only consider parses with the most tokens parsed
        maxTokens = max(len(parse) for parse in partial_parses_tokens)
        # print("MAX TOKENS", maxTokens)
        parse_num = 0
        pre_final_parses = []
        for parse in partial_parses_tokens:
            if len(parse) == maxTokens:
                pre_final_parses.append(partial_parses[parse_num])
            parse_num += 1

        #only consider parses from the previous set that have the minimum islands
        final_parses = []
        parse_num = 0
        minParses = min(len(parse) for parse in pre_final_parses)
        for parse in pre_final_parses:
            if len(parse) == minParses:
                final_parses.append(pre_final_parses[parse_num])
            parse_num += 1

        if len(final_parses) > 1:
            largest_length = -float('inf')
            largest_parse = []
            for final_parse in final_parses:
                if final_parse[0].count("(") > largest_length:
                    largest_length = final_parse[0].count("(")
            for final_parse in final_parses:
                if final_parse[0].count("(") == largest_length:
                    largest_parse.append(final_parse)
            return tuple(largest_parse), True
        return tuple(final_parses), True

    # only returns substrings of length substring_length whose words are all in the grammar's terminals
    # returns a list of tuples, each containing the token phrase and a list of the indices of its tokens: ("phrase", [indices]) - see the sketch after this class
    def substringsFilterNotInGrammar(self, words, substring_length):
        terminals = []
        for terminal in self.terminals:
            for splitted_terminal in terminal.split(" "):
                if "<" not in splitted_terminal:
                    terminals.append(splitted_terminal)
        # print(terminals)
        tokens_and_indices = []
        i = 0
        while i + substring_length <= len(words):
            temp = words[i:i + substring_length]
            add = True
            for word in temp:
                if word not in terminals:
                    add = False
            if add:
                string = " ".join(temp)
                string = re.sub(r' ([^A-Za-z0-9])', r'\1', string)
                indices = []
                for x in range(i, i + substring_length):
                    indices.append(x)
                tokens_and_indices.append((string, indices))
            i += 1
        # print(tokens_and_indices)
        return tokens_and_indices
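# A minimal usage sketch for substringsFilterNotInGrammar; the grammar file name
# and the sample words are assumptions, and the trailing comment only
# illustrates the ("phrase", [indices]) tuple shape the method returns.
island = IslandParser("component4.txt")
island.words = ["Buster", "ghosted", "Schiller"]
print(island.substringsFilterNotInGrammar(island.words, 2))
# e.g. [("Buster ghosted", [0, 1]), ("ghosted Schiller", [1, 2])]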
class ShiftReduceParser2:
    def __init__(self, grammar_file):
        '''
        GrammarEngine.grammar returns a Grammar object;
        to get the underlying dictionary, use GrammarEngine.grammar.grammar.
        '''
        self.grammar = GrammarEngine(grammar_file).grammar.grammar
        self.verbose = False

    def parse(self, string, verbose=False):
        self.verbose = verbose

        stack = []
        remaining = string.split()

        # if there are tokens remaining to be parsed
        while len(remaining) > 0:
            token = remaining[0]
            remaining = remaining[1:]
            self.shift(stack, token)
            self.reduce(stack)

        # if you've exhausted all tokens but still need to reduce
        while len(stack) != 1:
            self.reduce(stack)

        return stack.pop()

    def shift(self, stack, token):
        if self.verbose:
            print(f"Appending '{token}'")
        stack.append(token)

    def reduce(self, stack):
        '''
    rules = {
        S -> <NP> <VP>
        NP -> <Det> <Nom> | <PropN>
        Nom -> <Adj> <Nom> | <N>
        VP -> <V> <NP> | <V> <S> | <V> <NP> <PP>
        PP -> <P> <NP>
        PropN -> Buster | Schiller | Joe
        Det -> the | a | every
        N -> bear | squirrel | tree | fish | log
        Adj -> angry | frightened | little | tall
        V -> chased | saw | dodged | loved | ghosted | said
        P -> under | over | near
    }

    Progression:

    ["Buster"]
    [(PropN "Buster")]
    [(NP (PropN "Buster"))]
    [(NP (PropN "Buster")), "ghosted"]
    [(NP (PropN "Buster")), (V "ghosted")]
    [(NP (PropN "Buster")), (V "ghosted"), "a"]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a")]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), "little"]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), (ADJ "little")]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), (ADJ "little"), "tree"]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), (ADJ "little"), "tree"]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), (ADJ "little"), (N "tree")]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), (ADJ "little"), (Nom (N "tree"))]
    [(NP (PropN "Buster")), (V "ghosted"), (DET "a"), (Nom (ADJ "little") (Nom (N "tree")))]
    [(NP (PropN "Buster")), (V "ghosted"), (NP (DET "a") (Nom (ADJ "little") (Nom (N "tree"))))]
    [(NP (PropN "Buster")), (VP (V "ghosted") (NP (DET "a") (Nom (ADJ "little") (Nom (N "tree")))))]
    [(S (NP (PropN "Buster")) (VP (V "ghosted") (NP (DET "a") (Nom (ADJ "little") (Nom (N "tree"))))))]

    return S 

    Pop an item.

    If it's a string, reduce it to a nonterminal as much as possible, each time creating a new node, subordinating the previous value as its child, and pushing the node back onto the stack.

    If it's a nonterminal:
    while the stack is not empty
      pop it off and save it,
      check whether it matches any nonterminal symbol's rules,
      if not, pop another symbol, concatenate it to the previous one in the correct order (in the expected format, with white space between items), and see whether this sequence matches any nonterminal symbol's rules.

      if there is a match, create a new node whose child is the matching list of nonterminals, and push this node back onto the stack.

      self.grammar is a dictionary of nonterminal symbols
    '''
        token = stack.pop()
        used_token = False

        # if it's a string, reduce it to a nonterminal
        if isinstance(token, str):
            for nonterminal in self.grammar.keys():
                nonterminal_symbol_obj = self.grammar[nonterminal]
                for rule in nonterminal_symbol_obj.rules:
                    if [token] == rule.body:
                        new_node = Node(symbol=nonterminal_symbol_obj,
                                        child=Node(token))
                        stack.append(new_node)
                        used_token = True
                        if self.verbose:
                            print(
                                f"Appending new node '{new_node.symbol.name}' from terminal symbol '{token}'"
                            )
                            print("stack looks like:")
                            print(stack)

        # if the popped token was not used (it was a nonterminal node, or a string
        # that matched no rule), push it back onto the stack
        if not used_token:
            if self.verbose:
                unused = token if isinstance(token, str) else token.symbol
                print(f"Because {unused} was not used, it was appended back onto the stack.")
            stack.append(token)

        # if it's a nonterminal, combine nonterminals if necessary to reduce
        token_list = []  # list containing nonterminal symbol objects
        token_list_to_be_modified = []  # list containing nodes
        reduced = False
        while len(stack) > 0 and not reduced:
            # if the token_list already contains something, add a white space before adding the new token
            if len(token_list) != 0:
                token_list.append(' ')
            node = stack.pop()
            token_list.append(node.symbol)  # [obj(NP), obj(VP)]
            token_list_to_be_modified.append(
                node)  # [Node(obj(NP)), Node(obj(VP))]
            # items come off the stack in reverse order, so reverse both lists to restore the original order
            token_list.reverse()
            token_list_to_be_modified.reverse()
            if self.verbose:
                print(f"Trying out production rule: {token_list}")

            # check if there's a matching rule to this token_list
            for nonterminal in self.grammar.keys():
                nonterminal_symbol_obj = self.grammar[nonterminal]
                for rule in nonterminal_symbol_obj.rules:
                    if token_list == rule.body:
                        # link the nodes that are adjacent to one another
                        for i in range(len(token_list_to_be_modified) - 1):
                            token_list_to_be_modified[
                                i].next = token_list_to_be_modified[i + 1]
                        new_node = Node(symbol=nonterminal_symbol_obj,
                                        child=token_list_to_be_modified[0])
                        stack.append(new_node)
                        reduced = True
                        if self.verbose:
                            print(
                                f"Appending new node '{new_node.symbol.name}' from nonterminal symbol '{nonterminal_symbol_obj.name}'"
                            )
                        break
                # small optimization: stop scanning the rest of the grammar once a reduction happened
                if reduced:
                    break

            # if there was a reduction, clear the intermediate lists, and move on
            if reduced:
                if self.verbose:
                    print("Reduced")
                token_list = []
                token_list_to_be_modified = []

            # if there wasn't a reduction, and you still have stuff in the stack, pop another item and add it to the token_list
            elif not reduced and len(stack) > 0:
                if self.verbose:
                    print("It was not reduced but trying again")
                    print("stack looks like:")
                    print(stack)
                # re-reverse so the next popped item is appended in the correct position
                token_list.reverse()
                token_list_to_be_modified.reverse()

            # if there was no reduction and nothing is left to pop, restore the stack and stop
            else:
                for node in token_list_to_be_modified:
                    stack.append(node)
                reduced = True
                if self.verbose:
                    print(
                        "It was not reduced and there are no more things to pop off the stack, so quitting, while restoring stack to original condition"
                    )
                    print("stack looks like:")
                    print(stack)

    def print_tree(self, root):
        # a leaf node holds the raw token string
        if isinstance(root.symbol, str):
            return root.symbol
        else:
            string = ""

            if root.child is not None:
                string += "(" + root.symbol.name + " " + self.print_tree(root.child) + ")"

            if root.next is not None:
                string += self.print_tree(root.next)

            return string
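A hedged usage sketch for ShiftReduceParser2, assuming "component2.txt" contains the grammar listed in the reduce() docstring above:

if __name__ == "__main__":
    sr_parser = ShiftReduceParser2("component2.txt")
    tree = sr_parser.parse("Buster ghosted a little tree", verbose=False)
    # prints a nested parse tree, roughly (S (NP (PropN Buster))(VP ...))
    print(sr_parser.print_tree(tree))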