def add_new_vocab_rule(self, rule):
    """
    Registers one extra vocabulary production and rebuilds the grammar.

    rule is an indexable pair: rule[0] is the name of the left-hand-side
    symbol (it is wrapped in NT) and rule[1] is handed to Production
    unchanged as the right-hand side.  After the production is appended to
    self.rules, self.cfg and self.parser are recreated from the extended
    rule list with "S" as the start symbol.
    """
    new_production = Production(NT(rule[0]), rule[1])
    self.rules.append(new_production)

    # The grammar and parser are rebuilt from the full, extended rule list.
    grammar = ContextFreeGrammar(NT("S"), self.rules)
    self.cfg = grammar
    self.parser = EarleyChartParser(grammar, trace=0)
def find_after_verb(parse):
    """
    Locates the first "After_Verb_*" node of a parse tree.

    The tree is walked depth-first, left to right.  A node matches when the
    left-hand side of its first production is "After_Verb_Tr" or
    "After_Verb_In"; the matching subtree itself is returned.  Returns None
    for a plain-string leaf or when no node matches.
    """
    # Leaves are plain strings and have no productions to inspect.
    if isinstance(parse, str):
        return None

    label = parse.productions()[0].lhs()
    if label == NT("After_Verb_Tr") or label == NT("After_Verb_In"):
        return parse

    # Not a match here: the first truthy result among the children wins.
    for child in parse:
        found = find_after_verb(child)
        if found:
            return found
    return None
def find_compound_noun(parse):
    """
    Locates the first compound-noun node of a parse tree.

    The tree is walked depth-first, left to right.  A node matches when the
    left-hand side of its first production is "CompoundNoun" or
    "CompoundNoun_Pl"; the matching subtree itself is returned.  Returns
    None for a plain-string leaf or when no node matches.
    """
    # Leaves are plain strings and have no productions to inspect.
    if isinstance(parse, str):
        return None

    label = parse.productions()[0].lhs()
    if label == NT("CompoundNoun") or label == NT("CompoundNoun_Pl"):
        return parse

    # Not a match here: the first truthy result among the children wins.
    for child in parse:
        found = find_compound_noun(child)
        if found:
            return found
    return None
def __init__(self, rules_file="rules.gr", vocab_file="vocabulary.gr"):
    """
    Reads in grammar rules (from rules_file) and vocab rules (from
    vocab_file) and creates self.cfg (a ContextFreeGrammar) and
    self.parser (an EarleyChartParser).

    Both files share one line format.  Blank lines and lines whose first
    non-blank character is "#" are skipped.  On every other line the first
    two characters are dropped, the text before the first tab is the
    left-hand side, and the text after it is the space-separated right-hand
    side.  Grammar right-hand sides are wrapped in NT; vocab right-hand
    sides are lower-cased and kept as plain strings.
    """
    def load_productions(path, make_rhs):
        # "with" closes the file even when a malformed line raises, which
        # the previous open()/close() pair did not guarantee.
        with open(path, "r") as source:
            for line in source:
                text = line.strip()
                if text == "" or text.startswith("#"):
                    continue
                # NOTE(review): the two dropped characters are presumably a
                # one-character leading column plus its tab -- confirm
                # against the .gr files.
                lhs, _, rhs = line[2:].partition("\t")
                self.rules.append(
                    Production(NT(lhs.strip()), make_rhs(rhs.strip())))

    self.rules = []

    # get the rules from rules_file: every right-hand symbol is a nonterminal
    load_productions(rules_file, lambda rhs: [NT(x) for x in rhs.split(" ")])

    # get the rules from vocab_file: right-hand sides are lower-cased words
    load_productions(vocab_file, lambda rhs: rhs.lower().split(" "))

    # create the grammar and parser
    self.cfg = ContextFreeGrammar(NT("S"), self.rules)
    self.parser = EarleyChartParser(self.cfg, trace=0)
def find_PP(parse):
    """
    Locates the first prepositional phrase of a parse tree.

    The tree is walked depth-first, left to right.  At the first node whose
    first production has "PP" as its left-hand side, the LAST child of that
    node (parse[-1]) is returned, not the PP node itself.  Returns None for
    a plain-string leaf or when the tree contains no PP.
    """
    # Leaves are plain strings and have no productions to inspect.
    if isinstance(parse, str):
        return None

    label = parse.productions()[0].lhs()
    if label == NT("PP"):
        return parse[-1]

    # Not a PP here: the first truthy result among the children wins.
    for child in parse:
        found = find_PP(child)
        if found:
            return found
    return None
def get_sentence_type(parse):
    """
    Classifies a parse tree as QUESTION, COMMAND or STATEMENT.

    The decision is made at the first clause node found depth-first:
    "Ind_Clause_Ques" / "Ind_Clause_Ques_Aux" give QUESTION; "Ind_Clause" /
    "Ind_Clause_Pl" give COMMAND when the first right-hand symbol is
    "VP_Inf" and STATEMENT otherwise.  Returns 0 for a plain-string leaf or
    when no clause node is found.
    """
    # Leaves are plain strings and carry no clause information.
    if isinstance(parse, str):
        return 0

    production = parse.productions()[0]
    label = production.lhs()

    if label == NT("Ind_Clause_Ques") or label == NT("Ind_Clause_Ques_Aux"):
        return QUESTION

    if label == NT("Ind_Clause") or label == NT("Ind_Clause_Pl"):
        # A clause that opens with an infinitive verb phrase is a command.
        if production.rhs()[0] == NT("VP_Inf"):
            return COMMAND
        return STATEMENT

    # Not a clause node: the first non-zero answer among the children wins.
    for child in parse:
        found = get_sentence_type(child)
        if found:
            return found
    return 0
def parse_NP(self, sen):
    """
    Parses a partial sentence (usually a noun phrase).

    A temporary grammar with start symbol "NP" is built from self.rules,
    and sen is stripped and split on single spaces before parsing.

    Returns the first parse found.  Returns None, after printing
    "failure", when there is no parse or when building the grammar or
    parsing raised an exception; in the latter case the traceback is
    printed first.
    """
    try:
        cfg_temp = ContextFreeGrammar(NT("NP"), self.rules)
        parser_temp = EarleyChartParser(cfg_temp, trace=0)
        parse = parser_temp.nbest_parse(sen.strip().split(" "), trace=0)
    except Exception:
        # Parsing is best-effort, so errors are reported rather than raised.
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        print(traceback.format_exc())
    else:
        if parse:
            return parse[0]

    print("failure")
    return None
def rand_sent(self, symbol=None):
    """
    Creates a random sentence from self.cfg.

    symbol is the nonterminal to expand; when omitted, expansion starts
    from NT("S").  One production for symbol is picked at random and
    printed, then every nonterminal on its right-hand side is expanded
    recursively while every other item is used as-is.

    Returns the generated words joined by single spaces.  Returns None,
    after printing a message, when symbol (or any nonterminal reached
    during expansion) has no productions.
    """
    if symbol is None:
        symbol = NT("S")

    candidates = self.cfg.productions(lhs=symbol)
    if not candidates:
        # This branch used to print an undefined name and raise NameError.
        print("no productions for %s" % (symbol,))
        return None

    chosen = random.choice(candidates)
    print(chosen)

    words = []
    for item in chosen.rhs():
        if isinstance(item, NT):
            # Recurse through self with the nonterminal to expand; the old
            # bare rand_sent(nt) call passed the symbol in place of self.
            expansion = self.rand_sent(item)
            if expansion is None:
                # A dead-end nonterminal makes the whole sentence fail
                # instead of handing None to " ".join below.
                return None
            words.append(expansion)
        else:
            words.append(item)
    return " ".join(words)
def find_noun(parse, exceptions=()):
    """
    Finds the first noun-like constituent in the parse.

    The tree is walked depth-first, left to right.  A node matches when the
    left-hand side of its first production is one of the labels listed in
    noun_labels below AND its leaves, joined by single spaces, do not
    appear in exceptions.

    exceptions is any container of strings to skip; it is only read, never
    modified.  Returns the matching subtree, or None for a plain-string
    leaf or when nothing matches.
    """
    # Leaves are plain strings and have no productions to inspect.
    if isinstance(parse, str):
        return None

    noun_labels = (
        "NP", "NP_1st", "NP_2nd", "NP_3rd", "NP_1st_Pl", "NP_3rd_Pl",
        "NP_Obj", "Name", "Place", "Program", "Org", "Field", "Nominal",
        "Command", "File_Addr", "Web_Addr", "CompoundNoun", "Noun",
        "Noun_Pl", "Nominal_Pl",
    )

    label = parse.productions()[0].lhs()
    # any() stops at the first matching label, like the or-chain it replaces.
    is_noun = any(label == NT(name) for name in noun_labels)
    if is_noun and " ".join(parse.leaves()) not in exceptions:
        return parse

    # Either not a noun or an excluded one: keep looking in the children.
    for child in parse:
        found = find_noun(child, exceptions)
        if found:
            return found
    return None
def find_topic(parse, type=None, qword=None):
    """
    Finds the topic of a sentence, based on the sentence type:
    either QUESTION, STATEMENT, or COMMAND.

    parse is a parse tree or a plain-string leaf.  type is one of the
    sentence-type constants; when it is None it is computed with
    get_sentence_type(parse).  qword is the question word to report for
    questions; when it is falsy it is taken from the first leaf of the
    first child of the question clause.

    The return shape depends on the branch taken:
      - QUESTION, at an "Ind_Clause_Ques*" node whose last right-hand
        symbol is one of the handled cases: a tuple (topic, qword).
      - STATEMENT and COMMAND: the topic subtree alone.
      - None for a plain-string leaf, for an unhandled structure, or when
        nothing is found.

    The chosen type, the production at each visited node and the branch
    taken are printed as the tree is walked.
    """
    # find the sentence type if it's not specified
    if type == None:
        type = get_sentence_type(parse)

    # leaves are plain strings and cannot contain a topic
    if isinstance(parse, str):
        return None
    tree = parse.productions()[0]
    print type, "- tree:", tree

    # for questions
    if type == QUESTION:
        # All questions start with the "Ind_Clause_Ques*" structure.
        # After that, there are several possibilities:
        #   - VP_3rd
        #   - Ind_Clause_Ques_Aux
        #   - Interrog_Clause
        #   - Ind_Clause_Inf*
        # Depending on which one of these comes next, keep searching
        # for the topic.
        if tree.lhs() == NT("Ind_Clause_Ques") or \
           tree.lhs() == NT("Ind_Clause_Ques_Aux"):
            if not qword:
                qword = parse[0].leaves()[0]
                # NOTE(review): this print is assumed to sit inside the
                # "if not qword" block -- confirm against the original layout.
                print "qword:", qword
            rhs = tree.rhs()
            if rhs[-1] == NT("VP_3rd"):
                print "VP_3rd"
                #return parse[-1][-1], qword
                # prefer an After_Verb_* node; fall back to a PP
                t = find_after_verb(parse[-1][-1])
                if not t:
                    t = find_PP(parse[-1][-1])
                return t, qword
            # this acts just like a statement, so call find_topic
            # again, but specifying the type=STATEMENT
            elif rhs[-1] == NT("Ind_Clause_Ques_Aux"):
                print "Ind_Clause_Ques_Aux"
                return find_topic(parse[-1][-1], type=STATEMENT), qword
            elif rhs[-1] == NT("Interrog_Clause"):
                print "Interrog_Clause"
                # prefer an After_Verb_* node; fall back to a PP
                t = find_after_verb(parse[-1][-1])
                if not t:
                    t = find_PP(parse[-1][-1])
                return t, qword
            # this acts just like a statement, so call find_topic
            # again, but specifying the type=STATEMENT
            elif rhs[-1] == NT("Ind_Clause_Inf") or \
                 rhs[-1] == NT("Ind_Clause_Inf_3rd"):
                print "Ind_Clause_Inf"
                return find_topic(parse[-1], type=STATEMENT), qword
            # any other last symbol falls through to the final "return None"
        # NOTE(review): this else is assumed to pair with the
        # "Ind_Clause_Ques*" test above, mirroring the STATEMENT and COMMAND
        # branches -- confirm against the original layout.
        else:
            # not a question clause yet: search the children; qword is not
            # forwarded, so it is recomputed at the clause node.  A tuple
            # returned from below is truthy even when its topic is None.
            for subtree in parse:
                subj = find_topic(subtree, type)
                if subj:
                    return subj
    # for statements
    elif type == STATEMENT:
        if tree.lhs() == NT("VP_1st") or \
           tree.lhs() == NT("VP_Inf"):
            # prefer an After_Verb_* node; fall back to a PP
            t = find_after_verb(parse[-1][-1])
            if not t:
                t = find_PP(parse[-1][-1])
            return t
        else:
            # not a verb phrase yet: first truthy result among the children
            for subtree in parse:
                subj = find_topic(subtree, type)
                if subj:
                    return subj
    # for commands
    elif type == COMMAND:
        if tree.lhs() == NT("VP_Inf"):
            rhs = tree.rhs()
            if rhs[-1] == NT("PP"):
                return parse[-1]
            # elif \
            #    rhs[-1] == NT("After_Verb_Tr") or \
            #    rhs[-1] == NT("After_Verb_In") or \
            #    rhs[-1] == NT("V_Inf_In_Neg") or \
            #    rhs[-1] == NT("VP_Inf") or \
            #    rhs[-1] == NT("NP_Obj"):
            else:
                # here the search starts at this node, not at parse[-1][-1]
                return find_after_verb(parse)
        elif tree.lhs() == NT("PP"):
            return parse[-1]
        else:
            # neither VP_Inf nor PP: first truthy result among the children
            for subtree in parse:
                subj = find_topic(subtree, type)
                if subj:
                    return subj
    # unknown sentence type, unhandled structure, or nothing found
    return None