def __init__(self, grammar_file):
    """Load the rule dictionary for the given grammar file.

    GrammarEngine(...).grammar is a Grammar object; its own .grammar
    attribute is the underlying dictionary of nonterminal symbols,
    which is what this parser works with directly.
    """
    engine = GrammarEngine(grammar_file)
    self.verbose = False
    self.grammar = engine.grammar.grammar
def __init__(self, grammar):
    """Initialise parser state from a grammar specification.

    Builds a GrammarEngine, a non-verbose RecursiveDescentParser on top
    of it, and caches the grammar's nonterminal and terminal symbols.
    """
    self.string = ""
    self.words = []
    engine = GrammarEngine(grammar)
    self.engine = engine
    self.RDP = RecursiveDescentParser(engine, False)
    self.nonterminals = engine.get_nonterminals(grammar)
    self.terminals = engine.get_terminals(grammar)
def component4():
    """Demo: generate one sentence from the component4 grammar, island-parse
    it, and print each island as "symbol:first-token"."""
    grammar_engine = GrammarEngine("component4.txt")
    grammar = grammar_engine.grammar
    # single generation pass (kept as a loop-equivalent comprehension so the
    # count is easy to bump)
    generated_texts = [grammar_engine.generate("origin") for _ in range(1)]
    final_sentence = " ".join(generated_texts)
    print(final_sentence)
    parser = IslandParser(grammar)
    result = parser.parse(final_sentence)
    for element in result:
        pieces = element.split()
        # strip the leading "(" from the symbol and the trailing ")" from the token
        print(pieces[0][1:] + ":" + pieces[1][:-1])
def change_verb(doc):
    """Replace every occurrence of the document's verb with a random synonym.

    The verb is located with find_verb(doc); its synonym candidates come from
    the rules for that verb in the "verb.txt" grammar.

    Args:
        doc: a spaCy-like token sequence — each item exposes a .text attribute.

    Returns:
        The sentence as a single space-joined string with the verb replaced.
    """
    verb = find_verb(doc)
    grammar = GrammarEngine("verb.txt").grammar
    rules = grammar.find(verb).rules
    # random.choice is the idiomatic way to pick one rule (vs. randint indexing)
    # NOTE(review): the [7:-3] slice strips fixed-width wrapping from the rule's
    # str() form — presumably something like "Rule(['...'])"; confirm against
    # the Rule class's __str__.
    new_verb = str(random.choice(rules))[7:-3]
    # Build the sentence with a list + join. The old concatenation loop
    # produced a stray leading space whenever the verb was the first token
    # (it prepended " " before anything else was in the string) — fixed here.
    words = [new_verb if word.text == verb else word.text for word in doc]
    return " ".join(words)
def component2():
    """Demo: recursive-descent parse of a fixed sentence against the
    component2 grammar, printing the resulting parse."""
    engine = GrammarEngine("component2.txt")
    parser = RecursiveDescentParser(engine.grammar, False)
    result = parser.parse("Joe said Buster ghosted Schiller", "S")
    print(result)
# NOTE(review): this is the tail of an island-parser method whose `def` is not
# visible in this chunk (it reads `self`, `length`, `fragments`); indentation
# below is a best-effort reconstruction of the mangled source.
while length >= 1:
    start = 0
    # slide a window of `length` fragments across the input
    while start <= len(fragments) - length:
        # get the substring
        substring = " ".join(fragments[start:start + length])
        # parse the substring, trying out every grammar symbol as start symbol
        for symbol in self.grammar.grammar.keys():
            result = self.parser.parse(string=substring, start_symbol_name=symbol)
            parse_already_exists = False
            # if the result is not None, that means there was a successful parse
            if result != None:
                # check if the result is a subset of some other partial parse
                for parse in self.partial_parses:
                    if result in parse:
                        parse_already_exists = True
                # if it's not a subset of any of the existing parses, add it to the list
                if not parse_already_exists:
                    self.partial_parses.append(result)
        # move the window
        start += 1
    # decrease the length of the expected substring
    length -= 1


if __name__ == "__main__":
    # smoke test: island-parse a free-form paragraph with the component4 grammar
    grammar = GrammarEngine("component4.txt").grammar
    parser = IslandParser(grammar=grammar, verbose=False)
    string = "Hello, I am Yemi, a senior CS major. I live in Cassat. A usual day looks like this for me: I wake up at 11 and go to work after having ramen for breakfast. After that, I drop my stuff in Cassat, have lunch in LDC, and then work in Little Joy."
    print(parser.parse(string=string))
class IslandParser:
    """Island parser: finds the largest parseable fragments ("islands") of a
    string under a grammar by running a recursive-descent parser over
    successively shorter substrings.

    NOTE(review): reconstructed from whitespace-mangled source; the nesting of
    a few blocks in parse() (flagged below) is a best-effort reconstruction.
    """

    def __init__(self, grammar):
        # grammar: the value handed to GrammarEngine — presumably a grammar
        # file name/spec; confirm against GrammarEngine's constructor.
        self.string = ""
        self.words = []
        self.engine = GrammarEngine(grammar)
        # non-verbose recursive-descent parser used on each candidate substring
        self.RDP = RecursiveDescentParser(self.engine, False)
        self.nonterminals = self.engine.get_nonterminals(grammar)
        self.terminals = self.engine.get_terminals(grammar)

    def parse(self, string):
        """Return (parses, success_flag) for the island parse of `string`.

        Can be given a list of nonterminals (if not, uses default).
        Returns a tuple of island parses, each of which is a list of partial
        parses of the object's words. Starting at the number of tokens, n, in
        the string and successively decreasing by 1, looks at all the n-length
        substrings of the string, attempts to parse each one, and if able adds
        it to a partial parse, then continues on.
        """
        puncts = [",", ".", "!"]  # NOTE(review): unused in this method
        # tokenize into words (keeping apostrophes) and punctuation marks
        self.words = re.findall(r"[\w']+|[.,!?;]", string)
        # False until it finds the largest thing it can parse
        biggest_parse = False
        # the partial parses - ends up being a list of lists;
        # each inner list is one partial parse
        partial_parses = []
        # the tokens that make up each partial parse - a list of lists
        partial_parses_tokens = []
        # the indices of the tokens that make up each partial parse - a list of lists
        partial_parses_indices = []
        symbols = self.nonterminals
        # try substrings from the full token count down to length 2
        for i in range(len(self.words), 1, -1):
            token_lists = self.substringsFilterNotInGrammar(self.words, i)
            biggest_parse_this_level = False
            for token_and_indices in token_lists:
                token = token_and_indices[0]
                indices = token_and_indices[1]
                for symbol in symbols:
                    parse = self.RDP.parse(token, symbol)
                    if parse != None:
                        if not biggest_parse:
                            # first island at the largest size: seed a new partial parse
                            temp = [parse]
                            partial_parses.append(temp)
                            partial_parses_tokens.append(
                                re.findall(r"[\w']+|[.,!?;]", token))
                            partial_parses_indices.append(indices)
                            biggest_parse_this_level = True
                        else:
                            little_tokens = re.findall(r"[\w']+|[.,!?;]",
                                                       token)  # NOTE(review): unused
                            parse_num = 0
                            for par in partial_parses_tokens:
                                # the new island may only extend a partial parse
                                # whose token indices it does not overlap
                                new_tokens = True
                                for partial in par:
                                    for index in indices:
                                        if index in partial_parses_indices[parse_num]:
                                            new_tokens = False
                                if new_tokens:
                                    new_tokens_list = re.findall(
                                        r"[\w']+|[.,!?;]", token)
                                    # copy-and-extend so the shorter parse also survives
                                    temp_parses = partial_parses[parse_num].copy()
                                    temp_parses.append(parse)
                                    temp_parses_tokens = partial_parses_tokens[parse_num].copy()
                                    temp_parses_indices = partial_parses_indices[parse_num].copy()
                                    for x in new_tokens_list:
                                        temp_parses_tokens.append(x)
                                    for index in indices:
                                        temp_parses_indices.append(index)
                                    partial_parses.append(temp_parses)
                                    partial_parses_tokens.append(temp_parses_tokens)
                                    partial_parses_indices.append(temp_parses_indices)
                                parse_num = parse_num + 1
                # if all tokens have been parsed, finish early
                # NOTE(review): nesting level of this early-exit check was
                # ambiguous in the mangled source — placed after the symbol
                # loop, per token; confirm against original formatting.
                all_tokens_parses = []
                x = 0
                all_parsed = False
                for tokensList in partial_parses_tokens:
                    if len(tokensList) == len(self.words):
                        all_tokens_parses.append(partial_parses[x])
                        all_parsed = True
                    x += 1
                if all_parsed:
                    # among full covers, keep only those with the fewest islands
                    final_parses = []
                    x = 0
                    minParses = min(len(parse) for parse in all_tokens_parses)
                    for parse in all_tokens_parses:
                        if len(parse) == minParses:
                            final_parses.append(parse)
                        x += 1
                    if len(final_parses) > 1:
                        # tie-break on the deepest first island: "(" count in
                        # its tree string approximates parse-tree depth
                        largest_length = -float('inf')
                        largest_parse = []
                        for final_parse in final_parses:
                            if final_parse[0].count("(") > largest_length:
                                largest_length = final_parse[0].count("(")
                        for final_parse in final_parses:
                            if final_parse[0].count("(") == largest_length:
                                largest_parse.append(final_parse)
                        return tuple(largest_parse), True
                    # NOTE(review): this early return yields a list, while the
                    # end-of-method returns yield tuples — inconsistent for callers
                    return final_parses, True
            if biggest_parse_this_level:
                biggest_parse = True
        if partial_parses == []:
            # no possible partial parses for the given string and grammar;
            # ensure there is white space around each token
            return (), False
        # choose which partial parses to return:
        # only consider parses with the most tokens parsed
        maxTokens = max(len(parse) for parse in partial_parses_tokens)
        parse_num = 0
        pre_final_parses = []
        for parse in partial_parses_tokens:
            if len(parse) == maxTokens:
                pre_final_parses.append(partial_parses[parse_num])
            parse_num += 1
        # only consider parses from the previous set that have the minimum islands
        final_parses = []
        parse_num = 0
        minParses = min(len(parse) for parse in pre_final_parses)
        for parse in pre_final_parses:
            if len(parse) == minParses:
                final_parses.append(pre_final_parses[parse_num])
            parse_num += 1
        if len(final_parses) > 1:
            # same depth tie-break as the early-exit path above
            largest_length = -float('inf')
            largest_parse = []
            for final_parse in final_parses:
                if final_parse[0].count("(") > largest_length:
                    largest_length = final_parse[0].count("(")
            for final_parse in final_parses:
                if final_parse[0].count("(") == largest_length:
                    largest_parse.append(final_parse)
            return tuple(largest_parse), True
        return tuple(final_parses), True

    # only returns strings of length substring_length whose words are all in
    # the grammar's terminals.
    # returns a list of tuples, where each tuple contains the token phrase and
    # a list of indices of each token in the phrase: ("phrase", [indices])
    def substringsFilterNotInGrammar(self, words, substring_length):
        # flatten multi-word terminals into individual words, skipping
        # anything containing a nonterminal marker "<"
        terminals = []
        for terminal in self.terminals:
            for splitted_terminal in terminal.split(" "):
                if "<" not in splitted_terminal:
                    terminals.append(splitted_terminal)
        tokens_and_indices = []
        i = 0
        while i + substring_length <= len(words):
            temp = words[i:i + substring_length]
            add = True
            for word in temp:
                if word not in terminals:
                    add = False
            if add:
                string = " ".join(temp)
                # undo the space that join() put before punctuation tokens
                string = re.sub(r' ([^A-Za-z0-9])', r'\1', string)
                indices = []
                for x in range(i, i + substring_length):
                    indices.append(x)
                tokens_and_indices.append((string, indices))
            i += 1
        return tokens_and_indices
class ShiftReduceParser2:
    """Bottom-up shift-reduce parser driven by a grammar dictionary.

    NOTE(review): reconstructed from whitespace-mangled source; indentation is
    a best-effort reconstruction (the combine loop in reduce() is placed at
    method level, which matches the progression example in its docstring).
    """

    def __init__(self, grammar_file):
        '''
        GrammarEngine.grammar returns a Grammar object;
        to get the actual dictionary, do GrammarEngine.grammar.grammar
        '''
        # maps nonterminal name -> nonterminal symbol object (exposes .rules)
        self.grammar = GrammarEngine(grammar_file).grammar.grammar
        self.verbose = False

    def parse(self, string, verbose=False):
        """Parse a whitespace-separated string; return the root parse node.

        Shifts one token at a time, reducing after each shift; once the input
        is exhausted, keeps reducing until a single node remains on the stack.
        Note: loops forever if the string cannot be reduced to one node.
        """
        self.verbose = verbose
        stack = []
        remaining = string.split()
        # if there are tokens remaining to be parsed
        while len(remaining) > 0:
            token = remaining[0]
            remaining = remaining[1:]
            self.shift(stack, token)
            self.reduce(stack)
        # if you've exhausted all tokens but still need to reduce
        while len(stack) != 1:
            self.reduce(stack)
        return stack.pop()

    def shift(self, stack, token):
        # push the next raw input token (a plain string) onto the stack
        if self.verbose:
            print(f"Appending '{token}'")
        stack.append(token)

    def reduce(self, stack):
        '''
        Perform one reduction pass over the stack.

        Example grammar:
            S -> <NP> <VP>
            NP -> <Det> <Nom> | <PropN>
            Nom -> <Adj> <Nom> | <N>
            VP -> <V> <NP> | <V> <S> | <V> <NP> <PP>
            PP -> <P> <NP>
            PropN -> Buster | Schiller | Joe
            Det -> the | a | every
            N -> bear | squirrel | tree | fish | log
            Adj -> angry | frightened | little | tall
            V -> chased | saw | dodged | loved | ghosted | said
            P -> under | over | near

        Example progression (one state per reduction step):
            ["Buster"]
            [(PropN "Buster")]
            [(NP (PropN "Buster"))]
            [(NP (PropN "Buster")), "ghosted"] ... and so on, until:
            [(S (NP (PropN "Buster")) (VP (V "ghosted")
                (NP (DET "a") (Nom (ADJ "little") (Nom (N "tree"))))))]

        Algorithm: pop an item; if it is a string, reduce it to a nonterminal
        node. Then repeatedly pop nonterminals, concatenating their symbols
        (space-separated, mirroring rule bodies) and checking the result
        against every rule; on a match, wrap the matched nodes in a new node
        and push it. If nothing matches and the stack is empty, restore the
        popped nodes in their original order and give up for this call.
        self.grammar is a dictionary of nonterminal symbols.
        '''
        token = stack.pop()
        used_token = False
        # if it's a string, reduce it to a nonterminal
        if type(token) == str:
            for nonterminal in self.grammar.keys():
                nonterminal_symbol_obj = self.grammar[nonterminal]
                for rule in nonterminal_symbol_obj.rules:
                    if [token] == rule.body:
                        new_node = Node(symbol=nonterminal_symbol_obj,
                                        child=Node(token))
                        stack.append(new_node)
                        used_token = True
                        if self.verbose:
                            print(f"Appending new node '{new_node.symbol.name}' from terminal symbol '{token}'")
                            print("stack looks like:")
                            print(stack)
        # if the popped token was not used because it was a nonterminal
        # symbol, push it back in
        # NOTE(review): if a *string* token matches no rule, this branch hits
        # token.symbol on a str and would raise when verbose is on — confirm
        # the grammar always covers every input token.
        if not used_token:
            if self.verbose:
                print(f"Because {token.symbol} was not used, appended back on the stack.")
            stack.append(token)
        # if it's a nonterminal, combine nonterminals if necessary to reduce
        token_list = []  # list containing nonterminal symbol objects
        token_list_to_be_modified = []  # list containing nodes
        reduced = False
        while len(stack) > 0 and not reduced:
            # if the token_list already contains something, add a white space
            # before adding a new token — rule bodies contain ' ' separators
            if len(token_list) != 0:
                token_list.append(' ')
            node = stack.pop()
            token_list.append(node.symbol)  # e.g. [obj(NP), obj(VP)]
            token_list_to_be_modified.append(node)  # e.g. [Node(obj(NP)), Node(obj(VP))]
            # reverse the lists: popping yields items in the opposite order
            token_list.reverse()
            token_list_to_be_modified.reverse()
            if self.verbose:
                print(f"Trying out production rule: {token_list}")
            # check if there's a matching rule to this token_list
            for nonterminal in self.grammar.keys():
                nonterminal_symbol_obj = self.grammar[nonterminal]
                for rule in nonterminal_symbol_obj.rules:
                    if token_list == rule.body:
                        # link the nodes that are adjacent to one another
                        for i in range(len(token_list_to_be_modified) - 1):
                            token_list_to_be_modified[i].next = token_list_to_be_modified[i + 1]
                        new_node = Node(symbol=nonterminal_symbol_obj,
                                        child=token_list_to_be_modified[0])
                        stack.append(new_node)
                        reduced = True
                        if self.verbose:
                            print(f"Appending new node '{new_node.symbol.name}' from nonterminal symbol '{nonterminal_symbol_obj.name}'")
                        break
                # tiny optimization: stop scanning rules once reduced
                if reduced:
                    break
            # if there was a reduction, clear the intermediate lists and move
            # on (the loop condition then exits this reduce() call)
            if reduced:
                if self.verbose:
                    print("Reduced")
                token_list = []
                token_list_to_be_modified = []
            # if there wasn't a reduction and there is still stuff in the
            # stack, pop another item and add it to the token_list
            elif not reduced and len(stack) > 0:
                if self.verbose:
                    print("It was not reduced but trying again")
                    print("stack looks like:")
                    print(stack)
                # re-reverse so the next popped item lands in the right slot
                token_list.reverse()
                token_list_to_be_modified.reverse()
            # no reduction and nothing left to pop: restore the stack to its
            # original condition and quit this call
            else:
                for node in token_list_to_be_modified:
                    stack.append(node)
                reduced = True
                if self.verbose:
                    print("It was not reduced and there are no more things to pop off the stack, so quitting, while restoring stack to original condition")
                    print("stack looks like:")
                    print(stack)

    def print_tree(self, root):
        """Render the parse tree rooted at `root` as an s-expression string,
        e.g. (S (NP ...) (VP ...)); leaf nodes print their raw token."""
        if type(root.symbol) == str:
            return root.symbol
        else:
            string = ""
            if root.child != None:
                string += "(" + root.symbol.name + " " + self.print_tree(root.child) + ")"
            if root.next != None:
                # siblings are chained via .next and concatenated in order
                string += self.print_tree(root.next)
            return string