class ParseToSurface(object):
    """
    Object that converts parses to surface structure.
    """

    def __init__(self, comparative_mgr, det_pronoun_mgr, personal_mgr,
                 plural_mgr, pro_adverb_mgr, say_state, time_of_day_mgr,
                 verb_mgr):
        # For extracting the correct verb conjugation from subjects.
        self.arbitrary_idiolect = Idiolect()
        self.say_state = say_state
        self.subject_say_context = SayContext(
            prep=None, has_left=True, has_right=True, is_possessive=False)

        self.end_punct_clf = EndPunctClassifier()
        self.verb_extractor = VerbExtractor(verb_mgr)

        self.comparative_mgr = comparative_mgr
        self.det_pronoun_mgr = det_pronoun_mgr
        self.personal_mgr = personal_mgr
        self.plural_mgr = plural_mgr
        self.pro_adverb_mgr = pro_adverb_mgr
        self.time_of_day_mgr = time_of_day_mgr

        # Token tag -> method that yields (prep, verb arg) candidates.
        self.tag2recognize_arg = {
            'DT': self.recog_dt,
            'EX': self.recog_ex,
            'JJ': self.recog_jj,
            'JJR': self.recog_jjr,
            'NN': self.recog_nn,
            'NNS': self.recog_nns,
            'NNP': self.recog_nnp,
            'PRP': self.recog_prp,
            'WP': self.recog_wp,
        }

        # Adverb tags whose handlers may decode a (prep, arg) pair from a
        # single word (pro-adverbs like "here" and "where").
        self.tag2recognize_prep_arg = {
            'RB': self.recog_rb,
            'WRB': self.recog_wrb,
            'RBR': self.recog_rbr,
        }

        # Tags that can never head a verb argument.
        self.invalid_verb_arg_root_tags = set([
            '.',
            'VB',
        ])

        self.directions = """
            north south east west left right
        """.split()

    def recog_dt(self, root_token):
        """
        Eg, "this" (a determiner used as a pronoun).
        """
        nn = []
        for selector in self.det_pronoun_mgr.parse_pronoun(root_token.text):
            n = SurfaceCommonNoun(selector=selector)
            nn.append(n)
        return map(lambda n: (None, n), nn)

    def recog_ex(self, root_token):
        """
        Existential there, eg "there" in "there are cats".
        """
        p_n = (None, ExistentialThere())
        return [p_n]

    def recog_jj(self, root_token):
        """
        Eg, "blue".
        """
        return [(None, Adjective(root_token.text))]

    def recog_jjr(self, root_token):
        """
        Comparatives, eg "bigger than X".
        """
        if len(root_token.downs) != 1:
            return []

        rel, child = root_token.downs[0]
        if rel != 'prep':
            return []

        if child.text != 'than':
            return []

        # Eg, "what is the castle [bigger than]?"
        if not child.downs:
            rr = []
            for degree, polarity, base in \
                    self.comparative_mgr.decode(None, root_token.text):
                n = SurfaceComparative(polarity, base, None)
                rr.append((None, n))
            return rr

        rel, child = child.downs[0]
        if rel != 'pobj':
            return []

        r = self.recognize_verb_arg(child)
        if r is None:
            return []

        pp_nn = r
        pp_nn = filter(lambda (p, n): not p, pp_nn)
        thans = map(lambda (p, n): n, pp_nn)

        rr = []
        for degree, polarity, base in \
                self.comparative_mgr.decode(None, root_token.text):
            for than in thans:
                n = SurfaceComparative(polarity, base, than)
                rr.append((None, n))
        return rr

    def recog_time_of_day(self, root_token):
        """
        Eg, "morning" in "this morning".
        """
        if not root_token.downs:
            pre = None
        elif len(root_token.downs) == 1:
            rel, child = root_token.downs[0]
            pre = child.text
        else:
            return []

        rr = self.time_of_day_mgr.decode(pre, root_token.text)
        return map(lambda r: ((TIME_PREP,), TimeOfDay(*r)), rr)

    def recog_how_many_nn_head(self, root_token, noun, n2):
        """
        * how many/few NN(S)  "how many cats"
        """
        many = None
        for rel, child in root_token.downs:
            if rel == 'amod' and child.text in ('few', 'many'):
                many = child
                break
        if not many:
            return []

        how = None
        for rel, child in many.downs:
            if rel == 'advmod' and child.text == 'how':
                how = child
        if not how:
            return []

        number = Number(None)
        correlative = Correlative.INDEF
        count_restriction = self.det_pronoun_mgr.cor2res_gno[correlative][0]
        selector = Selector.from_correlative(correlative, count_restriction)
        assert selector
        n = SurfaceCommonNoun(selector=selector, number=number, noun=noun)
        return [(None, n)]

    def recog_dt_nn_head(self, root_token, noun, gram_n2):
        """
        * (ADJS) NN(S)     "fat mice"
        * DT (ADJS) NN(S)  "the fat mice"
        """
        downs = filter(lambda (rel, child): rel not in ('cc', 'conj', 'prep'),
                       root_token.downs)
        if downs:
            dep, child = downs[0]
            if dep != 'det':
                return []

            s = child.text
            if s == 'a' or s == 'an':
                s = A_OR_AN
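            # "a" and "an" are collapsed to the single A_OR_AN key, so the
            # determiner table can treat the indefinite article as one entry.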
            maybe_selectors = self.det_pronoun_mgr.parse_determiner(s)
            selectors = []
            for maybe_selector in maybe_selectors:
                for sel in maybe_selector.restricted_to_grammatical_number(
                        gram_n2, self.det_pronoun_mgr.cor2res_gno):
                    selectors.append(sel)
        else:
            if gram_n2 == N2.SING:
                return []

            selector = Selector(
                Correlative.INDEF, N5.DUAL, N5.MANY, N5.DUAL, N5.MANY)
            selectors = [selector]

        attrs = []
        for dep, child in downs[1:]:
            if dep != 'amod':
                return []
            attrs.append(child.text)

        nn = []
        for selector in selectors:
            n = SurfaceCommonNoun(selector=selector, attributes=list(attrs),
                                  noun=noun)
            nn.append(n)
        return map(lambda n: (None, n), nn)

    def recog_posdet_nn_head(self, root_token, noun, gram_n2):
        """
        * PRP$ (ADJS) NN(S)  "my fat mice"
        * WP$ (ADJS) NN(S)   "whose fat mice"
        """
        downs = filter(lambda (rel, child): rel not in ('cc', 'conj', 'prep'),
                       root_token.downs)
        if not downs:
            return []

        rel, possessor = downs[0]
        if rel != 'pos':
            return []

        attrs = []
        for rel, child in downs[1:]:
            if rel != 'amod':
                return []
            attrs.append(child.text)

        nn = []
        for declension in self.personal_mgr.posdet_parse((possessor.text,)):
            pos = PersonalPronoun(declension, PersonalPronounCase.OBJECT)
            correlative = Correlative.DEF
            count_restriction = \
                self.det_pronoun_mgr.cor2res_gno[correlative][0]
            selector = Selector.from_correlative(correlative,
                                                 count_restriction)
            if not selector:
                continue

            for selector in selector.restricted_to_grammatical_number(
                    gram_n2, self.det_pronoun_mgr.cor2res_gno):
                n = SurfaceCommonNoun(possessor=pos, selector=selector,
                                      attributes=list(attrs), noun=noun)
                nn.append(n)
        return map(lambda n: (None, n), nn)

    def recog_shortcut_head(self, root_token):
        """
        Pro-adverbs, eg "here" ("at this place").
        """
        pp_nn = []
        ss = root_token.text,
        for prep, selector, noun in self.pro_adverb_mgr.parse(ss):
            n = SurfaceCommonNoun(selector=selector, noun=noun)
            pp_nn.append((prep, n))
        return pp_nn

    def recog_common_noun_head(self, root_token, noun, n2):
        """
        Collect all readings of the head of a common noun phrase.
        """
        pp_nn = []
        pp_nn += self.recog_how_many_nn_head(root_token, noun, n2)
        pp_nn += self.recog_dt_nn_head(root_token, noun, n2)
        pp_nn += self.recog_posdet_nn_head(root_token, noun, n2)
        pp_nn += self.recog_shortcut_head(root_token)
        return pp_nn

    def recog_common_noun_tail(self, root_token):
        """
        Collect readings of each prepositional phrase hanging off the noun.
        """
        preps_optionss = []
        for rel, child in root_token.downs:
            if rel != 'prep':
                continue

            if len(child.downs) != 1:
                continue

            rel, grandchild = child.downs[0]
            if rel != 'pobj':
                continue

            r = self.recognize_verb_arg(grandchild)
            if r is None:
                return None

            pp_nn = r
            pp_nn = filter(lambda (p, n): not p, pp_nn)
            nn = map(lambda (p, n): n, pp_nn)
            prep = child.text,
            preps_optionss.append((prep, nn))
        return preps_optionss

    def recog_common_noun(self, root_token, noun, n2):
        """
        Combine each head reading with every combination of tail readings.
        """
        head_pp_nn = self.recog_common_noun_head(root_token, noun, n2)
        preps_optionss = self.recog_common_noun_tail(root_token)
        if not preps_optionss:
            return head_pp_nn

        rr = []
        tail_preps, tail_optionss = zip(*preps_optionss)
        for head_p, head_n in head_pp_nn:
            for tail_options in product(*tail_optionss):
                n = deepcopy(head_n)
                n.preps_nargs = zip(tail_preps, tail_options)
                rr.append((head_p, n))
        return rr

    def recog_direction(self, root_token):
        """
        Eg, "east of the castle".
        """
        if root_token.text not in self.directions:
            return []

        z = len(root_token.downs)
        if z == 0:
            return []
        elif z == 1:
            (prep, of), = root_token.downs
        elif z == 2:
            (det, dt), (prep, of) = root_token.downs
            if det != 'det':
                return []
            if dt.tag != 'DT':
                return []
        else:
            return []

        if prep != 'prep':
            return []

        if of.text != 'of':
            return []

        # Eg, "what is the castle [east of _]?"
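        # With the object of "of" elided (preposition stranding), emit a
        # direction whose anchor object is None.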
        if not of.downs:
            n = SurfaceDirection(root_token.text, None)
            return [(None, n)]

        dep, child = of.downs[0]
        if dep != 'pobj':
            return []

        r = self.recognize_verb_arg(child)
        if r is None:
            return None

        pp_nn = r
        pp_nn = filter(lambda (p, n): not p, pp_nn)
        ofs = map(lambda (p, n): n, pp_nn)

        pp_nn = []
        for of in ofs:
            d = SurfaceDirection(root_token.text, of)
            pp_nn.append((None, d))
        return pp_nn

    def recog_nn(self, root_token):
        """
        Eg, cat.
        """
        rr = self.recog_time_of_day(root_token)
        if rr:
            return rr

        rr = self.recog_direction(root_token)
        if rr:
            return rr

        sing = root_token.text
        rr = self.recog_common_noun(root_token, sing, N2.SING)
        return rr

    def recog_nns(self, root_token):
        """
        Eg, cats.
        """
        rr = []
        for sing in self.plural_mgr.to_singular(root_token.text):
            for r in self.recog_common_noun(root_token, sing, N2.PLUR):
                rr.append(r)
        return rr

    def recog_nnp(self, root_token):
        """
        Eg, James.
        """
        name = root_token.text,
        n = ProperNoun(name=name, is_plur=False)
        return [(None, n)]

    def recog_prp(self, root_token):
        """
        Eg, you.
        """
        ss = root_token.text,
        nn = self.personal_mgr.perspro_parse(ss)
        return map(lambda n: (None, n), nn)

    def recog_wp(self, root_token):
        """
        Eg, who.
        """
        nn = []

        # For WP like "what".
        for selector in self.det_pronoun_mgr.parse_pronoun(root_token.text):
            n = SurfaceCommonNoun(selector=selector)
            nn.append(n)

        # For WP like "who".
        ss = root_token.text,
        nn += self.personal_mgr.perspro_parse(ss)

        return map(lambda n: (None, n), nn)

    def recog_adverb(self, root_token):
        """
        We don't normally bother with adverbs, but we do recognize pro-adverbs
        as actually being (prep, argument) pairs encoded into a single word.
        """
        return self.recog_shortcut_head(root_token)

    def recog_rb(self, root_token):
        """
        Eg, here.

        Returns None on failure (a regular adverb, which we don't accept as an
        arg yet).
        """
        rr = self.recog_adverb(root_token)
        if rr:
            return rr
        return None

    def recog_wrb(self, root_token):
        """
        Eg, where.
        """
        return self.recog_adverb(root_token)

    def recog_rbr(self, root_token):
        """
        For bAbI: as a verb argument, this tag seems to appear only in invalid
        parses, so reject it.
        """
        return None

    def recognize_verb_arg(self, root_token):
        """
        Token -> None or list of (prep or None, SurfaceArgument)

        Returns None if we reject the token (do not recognize it as an arg).
        """
        while True:
            f = self.tag2recognize_arg.get(root_token.tag)
            if f:
                rr = f(root_token)
                break

            f = self.tag2recognize_prep_arg.get(root_token.tag)
            if f:
                rr = f(root_token)
                break

            if root_token.tag in self.invalid_verb_arg_root_tags:
                rr = []
                break

            print 'Unknown tag:', root_token.tag
            assert False

        if not rr:
            return rr

        # If the arg is conjoined ("X and Y"), find the conjunction word.
        op = None
        for rel, child in root_token.downs:
            if rel != 'cc':
                continue
            op = STR2CONJUNCTION.get(child.text)
            break
        if not op:
            return rr

        for rel, child in root_token.downs:
            if rel == 'conj':
                # TODO: generalize this later.
                other_pp_nn = self.recognize_verb_arg(child)
                if not other_pp_nn:
                    return other_pp_nn

                rr2 = []
                for prep, n in rr:
                    for other_prep, other_n in other_pp_nn:
                        r2 = prep, SurfaceConjunction(op, [n, other_n])
                        rr2.append(r2)
                return rr2

        return rr

    def find_subject(self, verb_span_pair, varg_root_indexes):
        """
        args -> (subj arg index, vmain index) or None if impossible
        """
        a, b = verb_span_pair
        if a and b:
            # Both spans ("would you see?"): find the argument that goes
            # between the spans.
            for i, arg in enumerate(varg_root_indexes):
                if a[1] < arg < b[0]:
                    return i, i + 1
            assert False
        elif a:
            # We must have args.  Otherwise, it's not possible.
            if not varg_root_indexes:
                return None

            # Just the pre span ("would you?"): find the argument directly
            # after the verb words.
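            # Eg, "would you?": a covers "would", and "you" is the first arg
            # root past the end of a, so it is the subject, and the main verb
            # slot follows it.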
            for i, arg in enumerate(varg_root_indexes):
                if a[1] < arg:
                    return i, i + 1

            # If there are no args afterward, it isn't possible.
            return None
        elif b:
            # Just the main span ("you would", "go!").

            # If no args, we're done.
            if not varg_root_indexes:
                return None, 0

            # Find the argument preceding the one directly after the verb
            # words, or subject index = None if no subject (imperative).
            for i, arg in enumerate(varg_root_indexes):
                if b[1] < arg:
                    if 0 <= i - 1:
                        return i - 1, i
                    else:
                        return None, i

            # Didn't find any argument after the verb, so the last one is the
            # subject.  TODO: "because of that, go!" -> "because" is not an
            # imperative subject.
            n = len(varg_root_indexes) - 1
            return n, n + 1
        else:
            assert False

    def extract_verb_args(self, root_token, verb_span_pair):
        """
        (root token, verb span pair) -> args or None if impossible

        Where args are (subj arg index, vmain idx, options per arg), where an
        option is a (prep, verb arg).
        """
        varg_root_indexes = []
        ppp_nnn = []
        adverbs = []
        for rel, t in root_token.downs:
            if rel not in ('nsubj', 'nsubjpass', 'agent', 'dobj', 'dative',
                           'expl', 'attr', 'advmod', 'prep', 'compound',
                           'npadvmod', 'acomp'):
                continue

            # Eg, "no longer".
            if rel == 'advmod':
                for rel, down in t.downs:
                    if down.text == 'no':
                        adverbs.append(down.text)
                if t.text == 'longer':
                    adverbs.append(t.text)
                continue

            # Eg, passives: "was eaten [by the cat]".
            if rel == 'agent':
                prep = 'by',
                if not t.downs:
                    pp_nn = [(prep, None)]
                    ppp_nnn.append(pp_nn)
                    varg_root_indexes.append(t.index)
                    continue
                assert len(t.downs) == 1
                rel, down = t.downs[0]
                assert rel == 'pobj'
                t = down
            elif t.tag == 'IN':
                if t.downs:
                    # Skip the conjunctions.
                    downs = filter(lambda (rel, child): rel == 'pobj',
                                   t.downs)
                    if len(downs) != 1:
                        continue
                    prep = t.text,
                    t = downs[0][1]
                else:
                    p = t.text,
                    pp_nn = [(p, None)]
                    ppp_nnn.append(pp_nn)
                    varg_root_indexes.append(t.index)
                    continue
            else:
                prep = None

            r = self.recognize_verb_arg(t)
            if r is None:
                continue
            pp_nn = r

            # if rel == 'npadvmod':
            #     for i, (p, n) in enumerate(pp_nn):
            #         if not p:
            #             pp_nn[i] = (TIME_PREP,), n

            # Prefer the spoken prep; fall back to a prep absorbed into the
            # arg itself (pro-adverbs).
            spoken_preps = [prep] * len(pp_nn)
            absorbed_preps, vargs = zip(*pp_nn) if pp_nn else ([], [])
            pp_nn = []
            for spoken_prep, absorbed_prep, varg in \
                    zip(spoken_preps, absorbed_preps, vargs):
                if spoken_prep:
                    prep = spoken_prep
                elif absorbed_prep:
                    prep = absorbed_prep
                else:
                    prep = None
                pp_nn.append((prep, varg))

            ppp_nnn.append(pp_nn)
            varg_root_indexes.append(t.index)

        r = self.find_subject(verb_span_pair, varg_root_indexes)
        if r is None:
            return None

        subj_argx, vmain_index = r
        return subj_argx, vmain_index, ppp_nnn, adverbs

    def conjs_from_verb(self, v):
        """
        recognized verb with wildcards -> possible conjugations
        """
        # If the verb's conj field is a wildcard, it could be any of them.
        # Copy, so that the &= below cannot mutate Conjugation.values.
        if v.conj is None:
            v_conjs = set(Conjugation.values)
        else:
            v_conjs = set([v.conj])

        # If imperative, it must be conjugated second person.
        if v.intrinsics.modality.flavor == ModalFlavor.IMPERATIVE:
            v_conjs &= set([Conjugation.S2, Conjugation.P2])

        return v_conjs

    def conjs_from_verb_args(self, pp_nn, subj_argx):
        """
        verb arguments -> possible conjugations
        """
        # It's possible to have no subject, in the case of imperatives.  In
        # that case, choose second person.
        if subj_argx is None:
            return set([Conjugation.S2, Conjugation.P2])

        # Get the required conjugation from the subject.
        subj_n = pp_nn[subj_argx][1]
        conj = subj_n.decide_conjugation(
            self.say_state, self.arbitrary_idiolect, self.subject_say_context)

        # In the case of existential there, get the conjugation from the
        # object instead.
        if not conj:
            x = subj_argx + 1
            if not (0 <= x < len(pp_nn)):
                return set([])  # Ex-there but no object = can't parse it.
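            # Eg, "there are cats": the expletive subject decides nothing, so
            # agreement falls through to the next argument ("cats" -> plural).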
            conj = pp_nn[x][1].decide_conjugation(
                self.say_state, self.arbitrary_idiolect,
                self.subject_say_context)

        return set([conj])

    def possible_conjugations(self, v, pp_nn, subj_argx):
        """
        verb and arguments -> possible conjugations

        Verb agreement.
        """
        v_conjs = self.conjs_from_verb(v)
        n_conjs = self.conjs_from_verb_args(pp_nn, subj_argx)
        return v_conjs & n_conjs

    def recognize_clause(self, root_token, is_root_clause):
        """
        root token -> list of SurfaceContentClause
        """
        cc = []
        for verb_span_pair, vv in \
                self.verb_extractor.extract(root_token, is_root_clause):
            # Hack to compensate for a bug in imperative verb saying.
            if verb_span_pair[0] and not verb_span_pair[1]:
                vv = filter(lambda v: not v.is_imperative(), vv)
                if not vv:
                    continue

            r = self.extract_verb_args(root_token, verb_span_pair)
            if r is None:
                continue
            subj_argx, vmain_index, ppp_nnn, adverbs = r

            # No subject means the verb must be imperative.
            if subj_argx is None:
                vv = filter(lambda v: v.is_imperative(), vv)
            else:
                assert 0 <= subj_argx < len(ppp_nnn)

            for v in vv:
                for pp_nn in product(*ppp_nnn):
                    pp_nn = list(pp_nn)
                    for conj in self.possible_conjugations(
                            v, pp_nn, subj_argx):
                        complementizer = Complementizer.ZERO
                        new_v = deepcopy(v)
                        new_v.conj = conj
                        c = SurfaceContentClause(
                            complementizer, new_v, adverbs, deepcopy(pp_nn),
                            vmain_index)
                        cc.append(c)
        return cc

    def recog(self, parse):
        """
        Parse -> yields SurfaceSentence
        """
        assert isinstance(parse, Parse)
        assert parse.tokens
        end_punct = self.end_punct_clf.classify(parse.tokens[-1].text)
        for clause in self.recognize_clause(parse.root, is_root_clause=True):
            # A question mark rules out imperative readings.
            if '?' in end_punct and clause.verb.is_imperative():
                continue
            yield SurfaceSentence(clause, end_punct)
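# Usage sketch (the managers and dependency parses are assumed to be built by
# the surrounding application; `parses` and `handle` are hypothetical
# stand-ins, not names from this module):
#
#     p2s = ParseToSurface(comparative_mgr, det_pronoun_mgr, personal_mgr,
#                          plural_mgr, pro_adverb_mgr, say_state,
#                          time_of_day_mgr, verb_mgr)
#     for parse in parses:
#         for sentence in p2s.recog(parse):
#             handle(sentence)  # Each is a SurfaceSentence.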