def collapse_graph(gr):
    """Collapse preposition and coordinating-conjunction nodes into edge labels.

    Prepositions: for every "prep" edge (u, v) whose target v has exactly one
    neighbor, an edge from u to that neighbor is added with "prep_<word>"
    (the lower-cased first word of v) and v is deleted.

    Conjunctions: for every "conj" edge (u, v) where u has exactly one "cc"
    neighbor, and that cc node has no neighbors of its own, the edge is
    re-added with "conj_<word>"; the collected cc nodes are deleted at the end.

    The graph is modified in place and is also returned.

    NOTE(review): the block structure was reconstructed from a
    whitespace-mangled source -- confirm the nesting against the original.
    """

    # The predicates unpack their edge argument in the body: tuple parameters
    # (``lambda (u, v): ...``) are a syntax error in Python 3 (PEP 3113).
    def is_single_child_prep(edge):
        u, v = edge
        # Compare the *number* of neighbors with 1. The previous code compared
        # the neighbor list itself with 1, which is never true, so no
        # preposition was ever collapsed.
        return gr.edge_label((u, v)) == "prep" and len(gr.neighbors(v)) == 1

    def is_conj_with_single_cc(edge):
        u, v = edge
        return (gr.edge_label((u, v)) == "conj"
                and len(u.neighbors().get("cc", [])) == 1)

    # prepositions
    prep_edges = find_edges(graph=gr, filterFunc=is_single_child_prep)
    for u, v in prep_edges:
        pobj = gr.neighbors(v)[0]
        # NOTE(review): the label is passed positionally here, unlike the
        # keyword ``label=`` used elsewhere -- confirm add_edge's signature.
        gr.add_edge((u, pobj), "prep_" + v.text[0].word.lower())
        gr.del_node(v)

    # conjunctions
    conj_edges = find_edges(graph=gr, filterFunc=is_conj_with_single_cc)
    toDel = []
    for u, v in conj_edges:
        cc = u.neighbors()['cc'][0]
        if len(gr.neighbors(cc)) == 0:
            gr.del_edge((u, v))
            gr.add_edge((u, v), "conj_" + cc.text[0].word.lower())
            toDel.append(cc)

    # A cc node may be shared by several conj edges: delete each one once.
    for n in set(toDel):
        gr.del_node(n)
    return gr
def _do_conditionals(self):
    """Rewrite one conditional construction so that its marker heads it.

    Looks for "mark" edges whose target word is one of if / while / because /
    although / as / once, and whose source takes part in an "advcl" relation
    (outgoing or incoming). For the first such construction the advcl and mark
    edges are deleted, the incoming edges of the head are moved onto the
    marker (unless the head has an incoming "rcmod"), conditional_specific()
    is invoked and the marker is flagged as a predicate.

    Returns True right after the first rewrite; otherwise falls off the end
    (returns None).

    NOTE(review): nesting was reconstructed from a whitespace-mangled source;
    in particular the position of ``return True`` should be confirmed.
    """
    marker_words = ["if", "while", "because", "although", "as", "once"]

    def is_conditional_mark(edge):
        father, mark = edge[0], edge[1]
        return ((self.edge_label((father, mark)) == "mark")
                and (mark.text[0].word.lower() in marker_words))

    for markFather, markNode in find_edges(self, is_conditional_mark):
        outgoing = markFather.neighbors()
        incoming = markFather.incidents()

        # Locate the advcl partner of the marker's father, in either direction.
        if "advcl" in outgoing:
            advclNode = outgoing["advcl"][0]
            advcl_edge = (markFather, advclNode)
            head = markFather
        elif "advcl" in incoming:
            advclNode = incoming["advcl"][0]
            advcl_edge = (advclNode, markFather)
            head = advclNode
        else:
            continue
        if not advclNode:
            continue

        # Skip when the advcl node itself has an incoming advcl edge.
        if "advcl" in advclNode.incidents():
            continue

        head = self.head(head)
        self.del_edge(advcl_edge)
        self.del_edge((markFather, markNode))
        if "rcmod" not in head.incidents():
            # Re-route every edge entering the head so it enters the marker.
            for father in self.incidents(head):
                duplicateEdge(graph=self,
                              orig=(father, head),
                              new=(father, markNode))
                self.del_edge((father, head))
        self.conditional_specific(markNode, markFather, advclNode)
        markNode.isPredicate = True
        return True
def do_questions(self):
    """
    Identify questions and introduce appropriate structure.

    This currently follows the syntactic format of geo query wh-questions,
    such as "How large is Texas?", where a WH question word ("How") depends
    on a modifier of some property ("Large").

    For every edge whose target reports is_wh_question(): "Questions" is added
    to self.types, the edge is deleted, an edge labelled QUESTION_INQUIRY is
    added from the wh node to find_top_of_component(self, modifier), and the
    wh node is marked as a predicate.
    """

    # Unpack by index: tuple parameters (``lambda (u, v): ...``) are a syntax
    # error in Python 3 (PEP 3113).
    def targets_wh_question(edge):
        return edge[1].is_wh_question()

    # Find relevant edges
    edges = find_edges(self, targets_wh_question)

    # Handle each separately
    for (modifier, wh_question) in edges:
        self.types.add("Questions")
        # 1. Remove dep edge
        self.del_edge((modifier, wh_question))
        # 2. Posit the wh question as head of the embedded clause
        self.add_edge(edge=(wh_question,
                            find_top_of_component(self, modifier)),
                      label=QUESTION_INQUIRY)
        # 3. Mark that the Wh-question node is a predicate
        wh_question.isPredicate = True
def do_acomp(self):
    """Restructure "acomp" edges that leave a predicate node.

    Only predicates with exactly one subject (per subject_dependencies) are
    handled; others are left untouched.

    * Modal predicate (first word or "Lemma" feature in self.modalVerbs): the
      complement becomes the predicate. It receives the subject through a
      domain_label edge, inherits the predicate's incoming edges and points
      back to the old predicate with SOURCE_LABEL. A childless, single-word
      old predicate that is a contraction is deleted; otherwise
      "acomp_as_modal" is recorded in self.types.
    * Any other predicate: "acomp_as_mwe" is recorded and the two nodes are
      merged.

    NOTE(review): nesting was reconstructed from a whitespace-mangled source.
    """

    # Unpack in the body: ``lambda((u, v)): ...`` is a syntax error in
    # Python 3 (PEP 3113).
    def is_predicate_acomp(edge):
        u, v = edge
        return self.edge_label((u, v)) == "acomp" and u.isPredicate

    edges = find_edges(self, is_predicate_acomp)
    for predNode, acompNode in edges:
        neighbors = predNode.neighbors()
        subjs = multi_get(neighbors, subject_dependencies)
        if len(subjs) != 1:
            # Zero or several subjects: nothing is done for this edge.
            continue
        if (predNode.text[0].word in self.modalVerbs) or (
                predNode.features.get("Lemma", "") in self.modalVerbs):
            subj = subjs[0]
            self.del_edge((predNode, acompNode))
            self.add_edge((acompNode, subj), label=domain_label)
            acompNode.isPredicate = True
            self.del_edge((predNode, subj))
            duplicate_all_incidents(gr=self, source=predNode, target=acompNode)
            self.add_edge((acompNode, predNode), label=SOURCE_LABEL)
            if (len(self.neighbors(predNode)) == 0) and (
                    len(predNode.text) == 1) and (
                        predNode.text[0].word in contractions):
                self.del_node(predNode)
            else:
                self.types.add("acomp_as_modal")
        else:
            self.types.add("acomp_as_mwe")
            merge_nodes(gr=self, node1=predNode, node2=acompNode)
def _do_conditionals(self):
    """Rewrite one conditional construction so that its marker heads it.

    Candidate edges are "mark" edges whose target word (lower-cased) is one of
    if / while / because / although / as / once. For each candidate the
    father's "advcl" partner is looked up among its outgoing edges first and
    its incoming edges second. Candidates without a partner, or whose partner
    has an incoming "advcl" edge, are skipped.

    For the first remaining candidate: the advcl and mark edges are deleted,
    every edge entering the head (self.head of the advcl source) is moved onto
    the marker unless the head has an incoming "rcmod", conditional_specific()
    is called and the marker is flagged as a predicate.

    Returns True right after that first rewrite; otherwise falls off the end
    (returns None).

    NOTE(review): nesting was reconstructed from a whitespace-mangled source;
    confirm the position of ``return True``.
    """

    # Unpack in the body: ``lambda((u, v)): ...`` is a syntax error in
    # Python 3 (PEP 3113).
    def is_conditional_mark(edge):
        u, v = edge
        return (self.edge_label((u, v)) == "mark") and (
            v.text[0].word.lower() in
            ["if", "while", "because", "although", "as", "once"])

    # find conditionals constructions
    edges = find_edges(self, is_conditional_mark)
    for (markFather, markNode) in edges:
        neighbors = markFather.neighbors()
        incidents = markFather.incidents()
        advclNode = False
        if "advcl" in neighbors:
            advclNode = neighbors["advcl"][0]
            toDel = (markFather, advclNode)
            head = markFather
        elif "advcl" in incidents:
            advclNode = incidents["advcl"][0]
            toDel = (advclNode, markFather)
            head = advclNode
        if advclNode:
            if "advcl" in advclNode.incidents():
                continue
            head = self.head(head)
            self.del_edge(toDel)
            self.del_edge((markFather, markNode))
            if "rcmod" not in head.incidents():
                # Move every edge entering the head onto the marker.
                for father in self.incidents(head):
                    duplicateEdge(graph=self,
                                  orig=(father, head),
                                  new=(father, markNode))
                    self.del_edge((father, head))
            self.conditional_specific(markNode, markFather, advclNode)
            markNode.isPredicate = True
            return True
def remove_aux(self):
    """Fold nodes reached through ignorable edges into their parents.

    For every edge whose label is in ignore_labels, if the target node is
    still registered in self.nodesMap, its original_text is appended to the
    source node's original_text and the target node is deleted.

    NOTE(review): nesting was reconstructed from a whitespace-mangled source;
    del_node is assumed to sit under the nodesMap check.
    """

    def is_ignorable(edge):
        return self.edge_label(edge) in ignore_labels

    for parent, child in find_edges(self, is_ignorable):
        # A child reachable through several ignorable edges may already be
        # gone from the map.
        if child.uid not in self.nodesMap:
            continue
        parent.original_text.extend(child.original_text)
        self.del_node(child)
def do_acomp(self):
    """Restructure "acomp" edges that leave a predicate node.

    Edges whose predicate does not have exactly one subject (according to
    subject_dependencies) are ignored. A non-modal predicate is merged with
    its complement ("acomp_as_mwe"). For a modal predicate the complement
    takes over as predicate: it gets the subject via domain_label, inherits
    the incoming edges and keeps a SOURCE_LABEL edge to the old predicate,
    which is deleted when it is a childless one-word contraction
    (otherwise "acomp_as_modal" is recorded).

    NOTE(review): nesting was reconstructed from a whitespace-mangled source.
    """

    def is_predicate_acomp(edge):
        pred, comp = edge[0], edge[1]
        return self.edge_label((pred, comp)) == "acomp" and pred.isPredicate

    for predNode, acompNode in find_edges(self, is_predicate_acomp):
        subjs = multi_get(predNode.neighbors(), subject_dependencies)
        if len(subjs) != 1:
            continue

        is_modal = (predNode.text[0].word in self.modalVerbs) or (
            predNode.features.get("Lemma", "") in self.modalVerbs)
        if not is_modal:
            self.types.add("acomp_as_mwe")
            merge_nodes(gr=self, node1=predNode, node2=acompNode)
            continue

        subj = subjs[0]
        self.del_edge((predNode, acompNode))
        self.add_edge((acompNode, subj), label=domain_label)
        acompNode.isPredicate = True
        self.del_edge((predNode, subj))
        duplicate_all_incidents(gr=self, source=predNode, target=acompNode)
        self.add_edge((acompNode, predNode), label=SOURCE_LABEL)

        removable = ((len(self.neighbors(predNode)) == 0)
                     and (len(predNode.text) == 1)
                     and (predNode.text[0].word in contractions))
        if removable:
            self.del_node(predNode)
        else:
            self.types.add("acomp_as_modal")
def fixExistensials(self):
    """
    Generate existensials structure.

    For each EXPL_LABEL edge (topNode, expl) whose topNode dereferences to
    exactly one subject: record APPENDIX_EXISTENSIALS, delete the expletive
    node, move every other outgoing edge of topNode onto the subject (keeping
    its label), then replace topNode's first word with EXISTENSIAL and clear
    its features.
    """

    def is_expletive(edge):
        return self.gr.edge_label(edge) == EXPL_LABEL

    for topNode, expl in find_edges(graph=self.gr, filterFunc=is_expletive):
        subjNodes = deref(graph=self.gr,
                          node=topNode,
                          rel=subject_dependencies)
        if len(subjNodes) != 1:
            continue

        self.types.add(APPENDIX_EXISTENSIALS)
        self.gr.del_node(expl)
        subjNode = subjNodes[0]

        # Snapshot the siblings first: edges are rewired inside the loop.
        siblings = [n for n in self.gr.neighbors(topNode) if n != subjNode]
        for sibling in siblings:
            old_edge = (topNode, sibling)
            self.gr.add_edge(edge=(subjNode, sibling),
                             label=self.gr.edge_label(old_edge))
            self.gr.del_edge(old_edge)

        topNode.text[0].word = EXISTENSIAL
        topNode.features = {}
def do_vmod_relclause(self):
    """Handle "rcmod" and "vmod" edges.

    rcmod edges (u, v): v.features["top"] is set. When u.pos() is in
    determined_labels the edge is deleted, "definite rcmod" is recorded and,
    if missing, a reverse edge (v, u) labelled ARG_LABEL is added.

    vmod edges (u, v): "vmod" is recorded, followed by the same treatment for
    determined heads (recording "definite vmod").

    NOTE(review): nesting was reconstructed from a whitespace-mangled source.
    The has_edge / add_edge step is placed inside the determined_labels
    branch here; it may originally have been one level out -- confirm.
    """

    # Unpack in the body: ``lambda (u, v): ...`` is a syntax error in
    # Python 3 (PEP 3113).
    def is_rcmod(edge):
        u, v = edge
        return self.edge_label((u, v)) == "rcmod"

    def is_vmod(edge):
        u, v = edge
        return self.edge_label((u, v)) == "vmod"

    edges = find_edges(self, is_rcmod)
    for (u, v) in edges:
        v.features["top"] = True
        if u.pos() in determined_labels:
            self.del_edge((u, v))
            self.types.add("definite rcmod")
            if not self.has_edge((v, u)):
                self.add_edge((v, u), label=ARG_LABEL)

    edges = find_edges(self, is_vmod)
    for (u, v) in edges:
        self.types.add("vmod")
        if u.pos() in determined_labels:
            self.del_edge((u, v))
            self.types.add("definite vmod")
            if not self.has_edge((v, u)):
                self.add_edge((v, u), label=ARG_LABEL)
def do_poss(self):
    """Introduce an explicit possessive node for every "poss" edge.

    For each edge (possessed, possessor) labelled "poss": "Possessives" is
    added to self.types and a node obtained from
    getPossesive(self, possessor.minIndex()) is linked to the possessor with
    POSSESSOR_LABEL and to the possessed node with POSSESSED_LABEL. The
    original "poss" edge is not removed here.
    """

    # Unpack in the body: ``lambda (u, v): ...`` is a syntax error in
    # Python 3 (PEP 3113).
    def is_poss(edge):
        u, v = edge
        return self.edge_label((u, v)) == "poss"

    edges = find_edges(self, is_poss)
    for (possessed, possessor) in edges:
        self.types.add("Possessives")
        possessiveNode = getPossesive(self, possessor.minIndex())  # TODO: refine index
        self.add_edge(edge=(possessiveNode, possessor),
                      label=POSSESSOR_LABEL)
        self.add_edge(edge=(possessiveNode, possessed),
                      label=POSSESSED_LABEL)
def do_existensials(self):
    """Fold childless expletive nodes into their governing node.

    For each "expl" edge (u, v) where v has no neighbors: "existensials" is
    added to self.types, u's text is deep-copied and its first word replaced
    with EXISTENSIAL, u's lemma is removed, v's surface form is appended to
    u's, u is flagged as implicit and v is deleted.
    """

    # Unpack in the body: ``lambda((u, v)): ...`` is a syntax error in
    # Python 3 (PEP 3113).
    def is_childless_expl(edge):
        u, v = edge
        return (self.edge_label((u, v)) == "expl"
                and len(self.neighbors(v)) == 0)

    edges = find_edges(self, is_childless_expl)
    for (u, v) in edges:
        self.types.add("existensials")
        # Copy before editing so Word objects referenced elsewhere are not
        # modified.
        u.text = deepcopy(u.text)
        u.text[0].word = EXISTENSIAL
        u.removeLemma()
        u.surface_form += v.surface_form
        u.features["implicit"] = True
        self.del_node(v)
def do_poss(self):
    """Introduce an explicit possessive node for every "poss" edge.

    Each "poss" edge (possessed, possessor) records "Possessives" in
    self.types and gets a node from getPossesive(self, possessor.minIndex())
    that points to the possessor (POSSESSOR_LABEL) and to the possessed node
    (POSSESSED_LABEL).
    """

    def is_poss(edge):
        return self.edge_label((edge[0], edge[1])) == "poss"

    for possessed, possessor in find_edges(self, is_poss):
        self.types.add("Possessives")
        # TODO: refine index
        possessiveNode = getPossesive(self, possessor.minIndex())
        for target, role in ((possessor, POSSESSOR_LABEL),
                             (possessed, POSSESSED_LABEL)):
            self.add_edge(edge=(possessiveNode, target), label=role)
def collapse_graph(gr):
    """Collapse preposition and coordinating-conjunction nodes into edge labels.

    Prepositions: every "prep" edge (u, v) whose target has exactly one
    neighbor is replaced by an edge from u to that neighbor carrying
    "prep_<lower-cased first word of v>", and v is deleted.

    Conjunctions: every "conj" edge (u, v) where u has exactly one "cc"
    neighbor without neighbors of its own is re-added carrying
    "conj_<lower-cased first word of cc>"; those cc nodes are deleted last.

    Modifies gr in place and returns it.

    NOTE(review): the block structure was reconstructed from a
    whitespace-mangled source -- confirm the nesting against the original.
    """

    # Tuple parameters (``lambda (u,v): ...``) are a syntax error in Python 3
    # (PEP 3113), so the predicates unpack the edge in their body.
    def prep_filter(edge):
        u, v = edge
        # len(...) is required: the previous ``gr.neighbors(v)==1`` compared a
        # list with an int and was therefore always False.
        return gr.edge_label((u, v)) == "prep" and len(gr.neighbors(v)) == 1

    def conj_filter(edge):
        u, v = edge
        return (gr.edge_label((u, v)) == "conj"
                and len(u.neighbors().get("cc", [])) == 1)

    # prepositions
    prep_edges = find_edges(graph=gr, filterFunc=prep_filter)
    for u, v in prep_edges:
        pobj = gr.neighbors(v)[0]
        # NOTE(review): label passed positionally, unlike ``label=`` used
        # elsewhere -- confirm add_edge's signature.
        gr.add_edge((u, pobj), "prep_" + v.text[0].word.lower())
        gr.del_node(v)

    # conjunctions
    conj_edges = find_edges(graph=gr, filterFunc=conj_filter)
    toDel = []
    for u, v in conj_edges:
        cc = u.neighbors()['cc'][0]
        if len(gr.neighbors(cc)) == 0:
            gr.del_edge((u, v))
            gr.add_edge((u, v), "conj_" + cc.text[0].word.lower())
            toDel.append(cc)

    # De-duplicate: the same cc node can be collected for several edges.
    for n in set(toDel):
        gr.del_node(n)
    return gr
def do_vmod_relclause(self):
    """Handle "rcmod" and "vmod" edges.

    Every rcmod target is flagged with features["top"]; every vmod edge
    records "vmod" in self.types. In both cases, when the source's pos() is
    in determined_labels the edge is deleted, "definite rcmod" /
    "definite vmod" is recorded and a reverse ARG_LABEL edge is added unless
    one already exists.

    NOTE(review): nesting was reconstructed from a whitespace-mangled source.
    The has_edge / add_edge step is kept inside the determined_labels branch;
    it may originally have been one level out -- confirm.
    """

    def labelled(wanted):
        # Build an edge predicate matching a single dependency label.
        def predicate(edge):
            return (self.edge_label((edge[0], edge[1])) == wanted)
        return predicate

    for head, clause in find_edges(self, labelled("rcmod")):
        clause.features["top"] = True
        if head.pos() not in determined_labels:
            continue
        self.del_edge((head, clause))
        self.types.add("definite rcmod")
        if not self.has_edge((clause, head)):
            self.add_edge((clause, head), label=ARG_LABEL)

    for head, clause in find_edges(self, labelled("vmod")):
        self.types.add("vmod")
        if head.pos() not in determined_labels:
            continue
        self.del_edge((head, clause))
        self.types.add("definite vmod")
        if not self.has_edge((clause, head)):
            self.add_edge((clause, head), label=ARG_LABEL)
def do_existensials(self):
    """Fold childless expletive nodes into their governing node.

    Each "expl" edge whose target has no neighbors records "existensials",
    turns the source's first word (on a deep copy of its text) into
    EXISTENSIAL, strips its lemma, appends the expletive's surface form, marks
    the source as implicit and deletes the expletive node.
    """

    def is_childless_expl(edge):
        governor, expletive = edge[0], edge[1]
        return (self.edge_label((governor, expletive)) == "expl"
                and len(self.neighbors(expletive)) == 0)

    for governor, expletive in find_edges(self, is_childless_expl):
        self.types.add("existensials")

        # Work on a private copy of the text before rewriting its first word.
        rewritten = deepcopy(governor.text)
        governor.text = rewritten
        rewritten[0].word = EXISTENSIAL

        governor.removeLemma()
        governor.surface_form += expletive.surface_form
        governor.features["implicit"] = True
        self.del_node(expletive)
def _fix(self):
    """Apply a series of local repairs to the dependency graph.

    1. A "mark" target that is a childless single word "that" is deleted;
       returns True immediately.
    2. An "rcmod" edge (u, v) without a reverse edge gets (v, u) labelled
       ARG_LABEL; returns True immediately.
    3. "prep" edges whose target has "pobj" as its only neighbor are
       collapsed into a "prep_<word>" edge from the source to the pobj.
    4. "pobj" edges whose source has no incoming "prep" are relabelled with
       the label of the source's earliest childless "prepc_*" neighbor, which
       is then deleted.

    As visible here the function has no final return statement, so it yields
    None when steps 1 and 2 did not fire.

    NOTE(review): nesting was reconstructed from a whitespace-mangled source;
    in step 3, del_node is assumed to sit under the has_edge check.
    """

    # All predicates unpack the edge in their body: tuple parameters
    # (``lambda (u, v): ...``) are a syntax error in Python 3 (PEP 3113).
    def is_mark(edge):
        u, v = edge
        return self.edge_label((u, v)) == "mark"

    def is_rcmod_without_back_edge(edge):
        u, v = edge
        return (self.edge_label((u, v)) == "rcmod") and (
            not self.has_edge((v, u)))

    def is_collapsible_prep(edge):
        u, v = edge
        return ((self.edge_label((u, v)) == "prep")
                and (len(self.neighbors(v)) == 1)
                and ("pobj" in v.neighbors()))

    def is_orphan_pobj(edge):
        u, v = edge
        return (self.edge_label((u, v)) == "pobj") and (
            "prep" not in u.incidents())

    def source_min_index(edge):
        return edge[0].minIndex()

    # remove mark->that
    edges = find_edges(self, is_mark)
    for (u, v) in edges:
        if (len(self.neighbors(v)) == 0) and (len(v.text) == 1) and (
                v.text[0].word == "that"):
            self.del_node(v)
            return True

    # rcmod with no relation to father
    edges = find_edges(self, is_rcmod_without_back_edge)
    for u, v in edges:
        self.add_edge((v, u), label=ARG_LABEL)
        return True

    # prep collapse
    edges = find_edges(self, is_collapsible_prep)
    if edges:
        for (u, v) in edges:
            pobj = v.neighbors()["pobj"][0]
            if not (self.has_edge((u, pobj))):
                w = v.text[0]
                u.surface_form += [w]
                self.add_edge((u, pobj), label="prep_" + w.word)
                self.del_node(v)

    # fix dependency collapse bugs
    edges = find_edges(self, is_orphan_pobj)
    for (u, v) in sorted(edges, key=source_min_index):
        neighbors = u.neighbors()
        candidates = [
            n for n in multi_get(
                neighbors,
                [rel for rel in neighbors if rel.startswith("prepc_")])
            if len(self.neighbors(n)) == 0
        ]
        candidates.sort(key=lambda n: n.minIndex())
        if len(candidates) > 0:
            curToDel = candidates[0]
            rel = self.edge_label((u, curToDel))
            self.del_edge((u, v))
            self.add_edge((u, v), label=rel)
            self.del_node(curToDel)
def _merge(self):
    """Merge the two endpoints of one joinable edge.

    An edge (u, v) qualifies when its label is in join_labels, or when it is
    a "conj_and" edge and the first item of u's "conjType" feature is '&'.

    For the first qualifying edge: if u carries a "conjType" feature, the
    conjunction word is appended to u.text (reusing the matching Word from
    u.surface_form when there is one, otherwise creating a Word indexed right
    after u.maxIndex()); then u and v are merged.

    Returns True after merging one edge, False when no edge qualifies.
    """

    # Unpack in the body: ``lambda (u,v): ...`` is a syntax error in
    # Python 3 (PEP 3113).
    def is_mergeable(edge):
        u, v = edge
        return (self.edge_label((u, v)) in join_labels) or (
            self.edge_label((u, v)) == "conj_and"
            and u.features.get("conjType", [""])[0] == '&')

    edges = find_edges(self, is_mergeable)
    for u, v in edges:
        conjType = u.features.get("conjType", False)
        if conjType:
            conjType = conjType[0]  # only the words
            matching = [w for w in u.surface_form if w.word == conjType]
            if matching:
                w = matching[0]
            else:
                w = Word(index=u.maxIndex() + 1, word=conjType)
            u.text.append(w)
        merge_nodes(self, u, v)
        # Only one merge per call.
        return True
    return False
def do_conj(self):
    # Introduces, for every "conj_*" relation leaving a node, a new predicate
    # node for the conjunction marker and re-hangs the conjuncts under it.
    #
    # NOTE(review): this block came from a whitespace-mangled source; the
    # nesting below is a best-effort reconstruction and must be confirmed
    # against the original file before relying on it.
    # NOTE(review): ``lambda((u, v)): ...`` is Python 2-only syntax.
    edges = find_edges(self, lambda((u, v)):self.edge_label((u, v)).startswith("conj_"))# and (not u.isPredicate) and (not v.isPredicate))
    # Distinct source nodes of conj_* edges.
    nodes = set([u for (u,_) in edges])
    for conj1 in nodes:
        curStartIndex = conj1.minIndex()+1
        curNeighbours = conj1.neighbors()
        # True when conj1 has at least one incoming edge and every incoming
        # edge is reported as an aux edge by is_aux_edge.
        isModifier = (not bool([father for father in self.incidents(conj1) if not self.is_aux_edge((father.uid, conj1.uid))])) and bool(self.incidents(conj1))
        for rel in [rel for rel in curNeighbours if rel.startswith("conj_")]:
            # The marker word is whatever follows "conj_" in the relation name.
            marker = rel.split("conj_")[1]
            markerNode = newNode.Node(text=[Word(curStartIndex+1,marker)], #TODO: how to find marker's index
                                      isPredicate=True,
                                      features={"conj":True},
                                      gr=self)
            #decide how to connect it to the rest of the graph, based on its type
            if isModifier:
                duplicate_all_incidents(gr=self, source=conj1, target=markerNode)
            else:
                for father in self.incidents(conj1):
                    for conj2 in curNeighbours[rel]:
                        duplicateEdge(graph=self, orig=((father,conj1)), new=((father,conj2)))
                    # NOTE(review): placement (per father vs. per conj2) is
                    # not recoverable from the mangled source -- confirm.
                    duplicateEdge(graph=self, orig=((father,conj1)), new=((father,markerNode)))
            if conj1.isPredicate:
                # Copy conj1's edges to the other conjuncts, for neighbors
                # whose span (first item of get_min_max_span) starts before
                # curStartIndex.
                for neighbor in self.neighbors(conj1):
                    if get_min_max_span(self, neighbor)[0] < curStartIndex:
                        for conj2 in curNeighbours[rel]:
                            if (self.edge_label((conj1,neighbor)) == SOURCE_LABEL) or (not self.is_aux_edge((conj1.uid, neighbor.uid))):
                                duplicateEdge(graph=self, orig=(conj1,neighbor), new=(conj2,neighbor))
            # create the coordination construction, headed by the marker
            self.add_edge(edge=(markerNode,conj1),label=rel)
            for conj2 in curNeighbours[rel]:
                self.del_edge((conj1,conj2))
                self.add_edge(edge=(markerNode,conj2),label=rel)
                if conj1.isPredicate:
                    conj2.isPredicate = conj1.isPredicate
                # Drop from conj1's surface form the words that belong to
                # conj2 or to conj1's own text (the latter are re-appended
                # below).
                conj1.surface_form = [w for w in conj1.surface_form if (w not in conj2.surface_form) and (w not in conj1.text) ]
            for w in conj1.text:
                if w not in conj1.surface_form:
                    conj1.surface_form.append(w)
            if conj1.features.get("conjType",False):
                # Remove from conj1.text the words whose index is listed in
                # the second item of the "conjType" feature.
                conj1.text = [w for w in conj1.text if w.index not in conj1.features["conjType"][1]]
            self.types.add(rel)
def fixProps(self):
    """
    Fix cases of conjunction of properties in indefinite nominals.

    Selects every edge (u, v) where u is not definite, v is a prop or
    rcmod-prop node and v is not prenominal. The selected edges are ordered
    by the first item of get_min_max_span(self.gr, v) and each one is
    re-added with the label "<old label>;<k>", k being its 1-based position
    in that order.
    """

    # Tuple parameters (``lambda (u,v): ...``) are a syntax error in Python 3
    # (PEP 3113); unpack / index inside the function bodies instead.
    def is_indefinite_prop(edge):
        u, v = edge
        return ((not isDefinite(u))
                and (isProp(v) or isRcmodProp(v))
                and (not v.is_prenominal()))

    def prop_span_start(edge):
        return get_min_max_span(self.gr, edge[1])[0]

    edges = find_edges(graph=self.gr, filterFunc=is_indefinite_prop)
    for counter, (u, v) in enumerate(sorted(edges, key=prop_span_start)):
        curLabel = self.gr.edge_label((u, v))
        # Relabelling is done by deleting the edge and adding it again.
        self.gr.del_edge((u, v))
        self.gr.add_edge(edge=(u, v),
                         label=";".join([curLabel, str(counter + 1)]))
def fixExistensials(self):
    """
    Generate existensials structure.

    Handles each EXPL_LABEL edge whose source node dereferences to exactly one
    subject: APPENDIX_EXISTENSIALS is recorded, the expletive node is removed,
    the source's remaining outgoing edges are transferred (label preserved) to
    the subject, and the source becomes a feature-less EXISTENSIAL node.
    """
    graph = self.gr
    expl_edges = find_edges(
        graph=graph,
        filterFunc=lambda edge: graph.edge_label(edge) == EXPL_LABEL)

    for top, expletive in expl_edges:
        subjects = deref(graph=graph, node=top, rel=subject_dependencies)
        if len(subjects) == 1:
            self.types.add(APPENDIX_EXISTENSIALS)
            graph.del_node(expletive)
            subject = subjects[0]

            # Materialise the list first; the loop body rewires these edges.
            to_move = [n for n in graph.neighbors(top) if n != subject]
            for moved in to_move:
                label = graph.edge_label((top, moved))
                graph.add_edge(edge=(subject, moved), label=label)
                graph.del_edge((top, moved))

            top.text[0].word = EXISTENSIAL
            top.features = {}
def _merge(self):
    """Merge the two endpoints of one joinable edge.

    Qualifying edges either carry a label from join_labels, or are "conj_and"
    edges whose source has '&' as the first item of its "conjType" feature.
    Only the first qualifying edge is processed: when the source has a
    "conjType" feature its conjunction word is appended to the source text
    (taken from the surface form if present there, otherwise a new Word placed
    after the source's maxIndex()), and then both nodes are merged.

    Returns True if an edge was merged, False otherwise.
    """

    def is_mergeable(edge):
        src = edge[0]
        label_key = (edge[0], edge[1])
        if self.edge_label(label_key) in join_labels:
            return True
        return (self.edge_label(label_key) == "conj_and"
                and src.features.get("conjType", [""])[0] == '&')

    for src, dst in find_edges(self, is_mergeable):
        conj_feature = src.features.get("conjType", False)
        if conj_feature:
            conj_word = conj_feature[0]  # only the words
            found = [w for w in src.surface_form if w.word == conj_word]
            word = (found[0] if found else
                    Word(index=src.maxIndex() + 1, word=conj_word))
            src.text.append(word)
        merge_nodes(self, src, dst)
        return True
    return False
def fixProps(self):
    """
    Fix cases of conjunction of properties in indefinite nominals.

    Edges (u, v) are selected when u is not definite, v is a prop or
    rcmod-prop node, and v is not prenominal. They are sorted by the first
    item of get_min_max_span(self.gr, v); each edge is then deleted and
    re-added with ";<k>" appended to its label, where k counts from 1 in
    sorted order.
    """

    # ``lambda (u, v): ...`` / ``lambda (_, propNode): ...`` are syntax errors
    # in Python 3 (PEP 3113), so plain one-argument functions are used.
    def selects(edge):
        u, v = edge
        return ((not isDefinite(u))
                and (isProp(v) or isRcmodProp(v))
                and (not v.is_prenominal()))

    def order_key(edge):
        _, propNode = edge
        return get_min_max_span(self.gr, propNode)[0]

    edges = find_edges(graph=self.gr, filterFunc=selects)
    for counter, (u, v) in enumerate(sorted(edges, key=order_key)):
        curLabel = self.gr.edge_label((u, v))
        self.gr.del_edge((u, v))
        self.gr.add_edge(edge=(u, v),
                         label=";".join([curLabel, str(counter + 1)]))
def do_questions(self):
    """
    Identify questions and introduce appropriate structure.

    This currently follows the syntactic format of geo query wh-questions,
    such as "How large is Texas?", where a WH question word ("How") depends
    on a modifier of some property ("Large").

    Every edge whose target reports is_wh_question() is removed and replaced
    by a QUESTION_INQUIRY edge from the wh node to the top of the modifier's
    component; the wh node becomes a predicate and "Questions" is recorded in
    self.types.
    """

    def points_at_wh_word(edge):
        return edge[1].is_wh_question()

    for modifier, wh_question in find_edges(self, points_at_wh_word):
        self.types.add("Questions")

        # Detach the wh word from the modifier it depended on ...
        self.del_edge((modifier, wh_question))

        # ... and make it the head of the embedded clause instead.
        clause_top = find_top_of_component(self, modifier)
        self.add_edge(edge=(wh_question, clause_top),
                      label=QUESTION_INQUIRY)

        # The wh word now acts as a predicate.
        wh_question.isPredicate = True
def inner():
    # One pass of graph clean-up rules over self.gr (``self`` comes from the
    # enclosing scope, which is not visible here). Several rules return True
    # as soon as they change something; the others set ``change`` and let the
    # pass continue. The final result is ``change``.
    #
    # NOTE(review): this block came from a whitespace-mangled source; the
    # nesting below is a best-effort reconstruction -- confirm it against the
    # original file.
    # NOTE(review): ``lambda (u, v): ...`` is Python 2-only syntax.
    change = False
    # 1,2
    # Re-attach a condition / preposition node under a sister prop node when
    # is_following(node1=sister, node2=node) holds.
    nodes = find_nodes(self.gr, isCondition)
    nodes.extend(find_nodes(self.gr, isPreposition))
    for curNode in nodes:
        sisterNodes = sister_nodes(graph=self.gr, node=curNode)
        for sisterNode in sisterNodes:
            if isProp(sisterNode) and is_following(
                    graph=self.gr, node1=sisterNode, node2=curNode):
                reattch(graph=self.gr, node=curNode, new_father=sisterNode)
                return True
                # NOTE(review): unreachable as laid out here; the original
                # indentation of this ``break`` is unknown.
                break
    # 3
    # Same for adverb nodes, with the is_following arguments swapped.
    nodes = find_nodes(self.gr, isAdverb)
    for curNode in nodes:
        sisterNodes = sister_nodes(graph=self.gr, node=curNode)
        for sisterNode in sisterNodes:
            if isProp(sisterNode) and is_following(
                    graph=self.gr, node1=curNode, node2=sisterNode):
                reattch(graph=self.gr, node=curNode, new_father=sisterNode)
                return True
                # NOTE(review): unreachable as laid out here -- see above.
                break
    #4
    # Replace "<COND>-that" condition nodes by direct father->child edges
    # labelled "that".
    nodes = find_nodes(
        self.gr, lambda n: isCondition(n) and n.text[0].word ==
        "{0}-{1}".format(COND, 'that'))
    for curNode in nodes:
        curFathers = self.gr.incidents(curNode)
        curChildren = self.gr.neighbors(curNode)
        for curFather in curFathers:
            for curChild in curChildren:
                self.gr.add_edge(edge=(curFather, curChild), label="that")
        self.gr.del_node(curNode)
        change = True
    #5
    # A conjunction whose single father is a conjunction of the same conjType
    # is removed; its children are attached to the father.
    filterFunc = lambda n: isConjunction(n) and len(
        self.gr.incidents(n)
    ) == 1 and isConjunction(self.gr.incidents(n)[0]) and (
        n.conjType == self.gr.incidents(n)[0].conjType
    )  #TODO: efficiency - multiple calls to incidents and a lot of deref
    nodes = find_nodes(self.gr, filterFunc)
    for curNode in nodes:
        curFather = self.gr.incidents(curNode)[0]
        for curChild in self.gr.neighbors(curNode):
            self.gr.add_edge((curFather, curChild))
        self.gr.del_node(curNode)
        change = True
    #6
    # A single-word "able" node with one father and one predicate child
    # reached via "xcomp" is removed; the child takes over the father's edge
    # label and gets a "Modal" feature.
    nodes = find_nodes(
        self.gr, lambda n: len(n.text) == 1 and n.text[0].word == "able")
    for curNode in nodes:
        curFathers = self.gr.incidents(curNode)
        if len(curFathers) == 1:
            curChildren = self.gr.neighbors(curNode)
            if len(curChildren) == 1:
                child = curChildren[0]
                if child.isPredicate and (self.gr.edge_label(
                    (curNode, child)) == "xcomp"):
                    father = curFathers[0]
                    self.gr.add_edge(edge=(father, child),
                                     label=self.gr.edge_label(
                                         (father, curNode)))
                    child.features["Modal"] = {
                        "Value": ['able']
                    }  #TODO: is this maybe overrun previous modals?
                    self.gr.del_node(curNode)
                    change = True
    #7
    # Time node whose only neighbor is another time node: lift the son's
    # children to the father and delete the son.
    edges = find_edges(
        self.gr, lambda (u, v): isTime(u) and isTime(v) and len(
            self.gr.neighbors(u)) == 1)
    for curFather, curSon in edges:
        for curNode in self.gr.neighbors(curSon):
            self.gr.add_edge(edge=(curFather, curNode),
                             label=self.gr.edge_label((curSon, curNode)))
        self.gr.del_node(curSon)
        return True
    #8
    edges = find_edges(
        self.gr, lambda (u, v): (isTime(v) or isLocation(v)) and
        isPreposition(u) and u.is_time_prep())
    for prepNode, timeNode in edges:
        if (len(self.gr.neighbors(prepNode)) == 1):
            # time node is only son - attach time to all of prep incidents
            for curFather in self.gr.incidents(prepNode):
                self.gr.add_edge(edge=(curFather, timeNode),
                                 label=self.gr.edge_label(
                                     (curFather, prepNode)))
            self.gr.del_node(prepNode)
            change = True
    #9
    conjNodes = find_nodes(
        self.gr, lambda n: isConjunction(n) and n.conjType.lower() == "and")
    for conjNode in conjNodes:
        curParents = []
        curChildren = self.gr.neighbors(conjNode)
        # Collect the children's parents other than the conjunction itself.
        for curChild in curChildren:
            curParents.extend([
                parent for parent in self.gr.incidents(curChild)
                if parent != conjNode
            ])
        if len(curParents) == 1:
            parent = curParents[0]
            if isProp(parent):
                # found a prop->conj construction
                # connect all prop to parent of conj and remove the conj node
                for child in curChildren:
                    if not (parent, child) in self.gr.edges():
                        self.gr.add_edge(edge=(parent, child))
                self.gr.del_node(conjNode)
                change = True
    #10
    # Short-circuits: fixRanges() is only called when nothing changed so far.
    change = change or self.fixRanges()
    #11
    # "loc" edge whose source has more than one neighbor: the loc node takes
    # over the source's other edges and the source is deleted.
    edges = find_edges(
        self.gr, lambda (u, v): self.gr.edge_label(
            (u, v)) == "loc" and len(self.gr.neighbors(u)) > 1)
    for topNode, loc in edges:
        for curNeigbor in self.gr.neighbors(topNode):
            if curNeigbor != loc:
                duplicateEdge(graph=self.gr,
                              orig=(topNode, curNeigbor),
                              new=(loc, curNeigbor))
        for curFather in self.gr.incidents(topNode):
            duplicateEdge(graph=self.gr,
                          orig=(curFather, topNode),
                          new=(curFather, loc))
        self.gr.del_node(topNode)
        self.types.remove(APPENDIX_LOCATION)
        change = True
    #12
    # Location node under a prop: its outgoing edges are copied to each of
    # its fathers, then the location node is deleted.
    edges = find_edges(graph=self.gr,
                       filterFunc=lambda (u, v): isProp(u) and isLocation(v))
    for _, locNode in edges:
        for curFather in self.gr.incidents(locNode):
            for curNeighbour in self.gr.neighbors(locNode):
                duplicateEdge(graph=self.gr,
                              orig=(locNode, curNeighbour),
                              new=(curFather, curNeighbour))
        self.gr.del_node(locNode)
        self.types.remove(APPENDIX_LOCATION)
        change = True
    #13
    # Prop with one father and one neighbor, pointing to a childless predicate.
    edges = find_edges(graph=self.gr,
                       filterFunc=lambda (u, v): isProp(u) and v.isPredicate
                       and (len(self.gr.neighbors(v)) == 0) and
                       (len(self.gr.incidents(u)) == 1) and
                       (len(self.gr.neighbors(u)) == 1))
    for propNode, predNode in edges:
        change = True
        curFather = self.gr.incidents(propNode)[0]
        if not isApposition(curFather):
            # Fold the predicate's text into the father and drop both nodes.
            jointNode = node.join(node1=curFather,
                                  node2=predNode,
                                  gr=self.gr)
            curFather.text = jointNode.text
            self.gr.del_nodes([propNode, predNode])
        else:
            self.gr.del_node(propNode)
            self.gr.add_edge((predNode, curFather))
            for curIncident in self.gr.incidents(curFather):
                duplicateEdge(graph=self.gr,
                              orig=(curIncident, curFather),
                              new=(curIncident, predNode))
                self.gr.del_edge((curIncident, curFather))
    #14
    propNodes = find_nodes(
        self.gr, lambda n: isProp(n) and len(self.gr.incidents(n)) == 1)
    for propNode in propNodes:
        curFather = self.gr.incidents(propNode)[0]
        if ((len(curFather.str) == 1) and (not isCopular(curFather)) and
            (curFather.str[0].word == "be"
             or curFather.str[0].word in contractions)) or (
                 (isProp(curFather) or isRcmodProp(curFather))
                 and len(self.gr.neighbors(curFather)) == 1):
            if len(self.gr.incidents(curFather)) == 1:
                curAncestor = self.gr.incidents(curFather)[0]
                duplicateEdge(graph=self.gr,
                              orig=(curAncestor, curFather),
                              new=(curAncestor, propNode))
                # NOTE(review): the nesting level of the next three
                # statements is not recoverable from the mangled source.
                self.gr.del_node(curFather)
                # this node no longer describes the "be" relation
                propNode.parent_relation = ''
                return True
    #15
    # Predicate -> "acomp" prop with one neighbor: join predicate and
    # complement into a new predicate node that inherits the predicate's
    # edges.
    edges = find_edges(
        graph=self.gr,
        filterFunc=lambda (u, v): isProp(v) and
        (v.parent_relation == "acomp") and len(self.gr.neighbors(v)) == 1 and
        u.isPredicate)
    for pred, prop in edges:
        acompNode = self.gr.neighbors(prop)[0]
        duplicateEdge(graph=self.gr,
                      orig=(pred, prop),
                      new=(pred, acompNode),
                      newLabel="modifier")
        self.gr.del_node(prop)  # TODO: could there be others connected to it?
        newPred = node.join(pred, acompNode, self.gr)
        newPred.isPredicate = True
        self.gr.add_node(newPred)
        for neigbour in self.gr.neighbors(pred):
            duplicateEdge(graph=self.gr,
                          orig=(pred, neigbour),
                          new=(newPred, neigbour))
        for curFather in self.gr.incidents(pred):
            duplicateEdge(graph=self.gr,
                          orig=(curFather, pred),
                          new=(curFather, newPred))
        if len(self.gr.neighbors(acompNode)) == 0:
            self.gr.del_node(acompNode)
        self.gr.del_node(pred)
        # newPred.features["debug"] =True #TODO: remove this
        self.types.add("ACOMP")
        return True
    #16
    # Prop / rcmod-prop node that points back at its own father and has a
    # single neighbor is deleted.
    edges = find_edges(graph=self.gr,
                       filterFunc=lambda (u, v):
                       (isProp(v) or isRcmodProp(v)) and
                       (u in self.gr.neighbors(v)))
    for _, v in edges:
        if (len(self.gr.neighbors(v)) == 1):
            self.gr.del_node(v)
            return True
    #17
    # Childless SOURCE_LABEL targets whose text is a contraction are deleted.
    edges = find_edges(graph=self.gr,
                       filterFunc=lambda (u, v): self.gr.edge_label(
                           (u, v)) == SOURCE_LABEL and
                       (len(self.gr.neighbors(v)) == 0))
    for _, v in edges:
        curStr = " ".join([w.word for w in v.text])
        if curStr in contractions:
            self.gr.del_node(v)
            return True
    #18 - verbal complements
    # 'ccomp' edges leaving a predicate are relabelled 'dobj'.
    edges = find_edges(graph=self.gr,
                       filterFunc=lambda (u, v): self.gr.edge_label(
                           (u, v)) == 'ccomp' and u.isPredicate)
    for u, v in edges:
        self.gr.del_edge((u, v))
        self.gr.add_edge(edge=(u, v), label='dobj')
        v.features["debug"] = True
        self.types.add("DEBUG")
        return True
    return change
candidates = [ n for n in multi_get( neighbors, [rel for rel in neighbors if rel.startswith("prepc_")]) if len(self.neighbors(n)) == 0 ] candidates.sort(key=lambda n: n.minIndex()) if len(candidates) > 0: curToDel = candidates[0] rel = self.edge_label((u, curToDel)) self.del_edge((u, v)) self.add_edge((u, v), label=rel) self.del_node(curToDel) # change agent edges with "prep_by" edges = find_edges(self, lambda edge: (self.edge_label(edge) == "agent")) for edge in edges: self.del_edge(edge) self.add_edge(edge, label="prep_by") # #add xcomp inverse node # edges = find_edges(self, lambda (u,v):self.edge_label((u,v)) == "xcomp" and u.isPredicate and v.isPredicate) # for (u,v) in edges: # if not self.has_edge((v, u)): # self.add_edge((v,u), label=SOURCE_LABEL) # self.types.add("infinitives") # return True # if not multi_get(v.neighbors(),subject_dependencies): # rcmodParentIncidents = u.incidents().get("rcmod",[]) # if len(rcmodParentIncidents)==1: # subj = rcmodParentIncidents[0]
def inner():
    # Runs the numbered clean-up rules below once over self.gr (``self`` is
    # taken from the enclosing scope, not visible here). Rules that return
    # True stop the pass right after their first change; the remaining rules
    # record their work in ``change``, which is returned at the end.
    #
    # NOTE(review): this block came from a whitespace-mangled source; the
    # nesting below is a best-effort reconstruction -- confirm it against the
    # original file.
    # NOTE(review): ``lambda (u,v): ...`` is Python 2-only syntax.
    change = False
    # 1,2
    # Condition / preposition nodes move under a sister prop node when
    # is_following(node1=sister, node2=node) holds.
    nodes = find_nodes(self.gr, isCondition)
    nodes.extend(find_nodes(self.gr, isPreposition))
    for curNode in nodes:
        sisterNodes = sister_nodes(graph=self.gr, node=curNode)
        for sisterNode in sisterNodes:
            if isProp(sisterNode) and is_following(graph=self.gr, node1=sisterNode, node2=curNode):
                reattch(graph=self.gr, node=curNode, new_father=sisterNode)
                return True
                # NOTE(review): unreachable as laid out here; the original
                # indentation of this ``break`` is unknown.
                break
    # 3
    # Adverb nodes: same rule with the is_following arguments swapped.
    nodes = find_nodes(self.gr, isAdverb)
    for curNode in nodes:
        sisterNodes = sister_nodes(graph=self.gr, node=curNode)
        for sisterNode in sisterNodes:
            if isProp(sisterNode) and is_following(graph=self.gr, node1=curNode, node2=sisterNode):
                reattch(graph=self.gr, node=curNode, new_father=sisterNode)
                return True
                # NOTE(review): unreachable as laid out here -- see above.
                break
    #4
    # "<COND>-that" condition nodes are replaced by father->child edges
    # labelled "that".
    nodes = find_nodes(self.gr, lambda n:isCondition(n) and n.text[0].word == "{0}-{1}".format(COND,'that'))
    for curNode in nodes:
        curFathers = self.gr.incidents(curNode)
        curChildren = self.gr.neighbors(curNode)
        for curFather in curFathers:
            for curChild in curChildren:
                self.gr.add_edge(edge = (curFather,curChild), label = "that")
        self.gr.del_node(curNode)
        change = True
    #5
    # Conjunction nested directly under a conjunction of the same conjType:
    # hand its children to the father and delete it.
    filterFunc = lambda n:isConjunction(n) and len(self.gr.incidents(n)) == 1 and isConjunction(self.gr.incidents(n)[0]) and (n.conjType == self.gr.incidents(n)[0].conjType) #TODO: efficiency - multiple calls to incidents and a lot of deref
    nodes = find_nodes(self.gr,filterFunc)
    for curNode in nodes:
        curFather = self.gr.incidents(curNode)[0]
        for curChild in self.gr.neighbors(curNode):
            self.gr.add_edge((curFather,curChild))
        self.gr.del_node(curNode)
        change = True
    #6
    # Single-word "able" node with one father and one predicate child via
    # "xcomp": the child inherits the father's edge label, gets a "Modal"
    # feature, and the "able" node is deleted.
    nodes = find_nodes(self.gr, lambda n:len(n.text)==1 and n.text[0].word == "able")
    for curNode in nodes:
        curFathers = self.gr.incidents(curNode)
        if len(curFathers)==1:
            curChildren = self.gr.neighbors(curNode)
            if len(curChildren) ==1:
                child = curChildren[0]
                if child.isPredicate and (self.gr.edge_label((curNode,child))=="xcomp"):
                    father = curFathers[0]
                    self.gr.add_edge(edge=(father,child), label=self.gr.edge_label((father,curNode)))
                    child.features["Modal"]={"Value":['able']} #TODO: is this maybe overrun previous modals?
                    self.gr.del_node(curNode)
                    change=True
    #7
    # Time node whose only neighbor is another time node: the son's children
    # are lifted to the father and the son is deleted.
    edges = find_edges(self.gr, lambda (u,v):isTime(u)and isTime(v) and len(self.gr.neighbors(u))==1)
    for curFather,curSon in edges:
        for curNode in self.gr.neighbors(curSon):
            self.gr.add_edge(edge=(curFather,curNode), label = self.gr.edge_label((curSon,curNode)))
        self.gr.del_node(curSon)
        return True
    #8
    edges = find_edges(self.gr, lambda (u,v):(isTime(v) or isLocation(v)) and isPreposition(u) and u.is_time_prep())
    for prepNode,timeNode in edges:
        if (len(self.gr.neighbors(prepNode))==1):
            # time node is only son - attach time to all of prep incidents
            for curFather in self.gr.incidents(prepNode):
                self.gr.add_edge(edge=(curFather,timeNode), label = self.gr.edge_label((curFather,prepNode)))
            self.gr.del_node(prepNode)
            change=True
    #9
    conjNodes = find_nodes(self.gr, lambda n: isConjunction(n) and n.conjType.lower() == "and")
    for conjNode in conjNodes:
        curParents = []
        curChildren = self.gr.neighbors(conjNode)
        # Parents of the children, excluding the conjunction node itself.
        for curChild in curChildren:
            curParents.extend([parent for parent in self.gr.incidents(curChild) if parent != conjNode])
        if len(curParents)==1:
            parent = curParents[0]
            if isProp(parent):
                # found a prop->conj construction
                # connect all prop to parent of conj and remove the conj node
                for child in curChildren:
                    if not (parent,child) in self.gr.edges():
                        self.gr.add_edge(edge = (parent,child))
                self.gr.del_node(conjNode)
                change = True
    #10
    # Short-circuits: fixRanges() runs only if nothing has changed so far.
    change = change or self.fixRanges()
    #11
    # "loc" edge from a node with more than one neighbor: the loc node takes
    # over the source's other edges, then the source is deleted.
    edges = find_edges(self.gr, lambda (u,v):self.gr.edge_label((u,v))=="loc" and len(self.gr.neighbors(u))>1)
    for topNode,loc in edges:
        for curNeigbor in self.gr.neighbors(topNode):
            if curNeigbor != loc:
                duplicateEdge(graph=self.gr, orig=(topNode,curNeigbor), new=(loc,curNeigbor))
        for curFather in self.gr.incidents(topNode):
            duplicateEdge(graph=self.gr, orig=(curFather,topNode), new=(curFather,loc))
        self.gr.del_node(topNode)
        self.types.remove(APPENDIX_LOCATION)
        change=True
    #12
    # Location node under a prop: copy its outgoing edges to each father and
    # delete it.
    edges = find_edges(graph=self.gr, filterFunc = lambda (u,v): isProp(u) and isLocation(v))
    for _,locNode in edges:
        for curFather in self.gr.incidents(locNode):
            for curNeighbour in self.gr.neighbors(locNode):
                duplicateEdge(graph=self.gr, orig=(locNode,curNeighbour), new=(curFather,curNeighbour))
        self.gr.del_node(locNode)
        self.types.remove(APPENDIX_LOCATION)
        change=True
    #13
    # Prop with one father and one neighbor, pointing to a childless predicate.
    edges = find_edges(graph=self.gr, filterFunc = lambda (u,v): isProp(u) and v.isPredicate and (len(self.gr.neighbors(v)) ==0) and (len(self.gr.incidents(u)) ==1) and (len(self.gr.neighbors(u)) ==1))
    for propNode,predNode in edges:
        change = True
        curFather = self.gr.incidents(propNode)[0]
        if not isApposition(curFather):
            # Fold the predicate's text into the father; drop both nodes.
            jointNode = node.join(node1=curFather, node2=predNode, gr=self.gr)
            curFather.text = jointNode.text
            self.gr.del_nodes([propNode,predNode])
        else:
            self.gr.del_node(propNode)
            self.gr.add_edge((predNode,curFather))
            for curIncident in self.gr.incidents(curFather):
                duplicateEdge(graph=self.gr, orig=(curIncident,curFather), new=(curIncident,predNode))
                self.gr.del_edge((curIncident,curFather))
    #14
    propNodes = find_nodes(self.gr, lambda n:isProp(n) and len(self.gr.incidents(n))==1)
    for propNode in propNodes:
        curFather = self.gr.incidents(propNode)[0]
        if ((len(curFather.str)==1) and (not isCopular(curFather)) and (curFather.str[0].word == "be" or curFather.str[0].word in contractions)) or ((isProp(curFather) or isRcmodProp(curFather)) and len(self.gr.neighbors(curFather))==1):
            if len(self.gr.incidents(curFather))==1:
                curAncestor = self.gr.incidents(curFather)[0]
                duplicateEdge(graph=self.gr, orig=(curAncestor,curFather), new=(curAncestor,propNode))
                # NOTE(review): the nesting level of the next three
                # statements is not recoverable from the mangled source.
                self.gr.del_node(curFather)
                # this node no longer describes the "be" relation
                propNode.parent_relation = ''
                return True
    #15
    # Predicate -> "acomp" prop with one neighbor: predicate and complement
    # are joined into a new predicate node that inherits the predicate's
    # edges.
    edges = find_edges(graph=self.gr, filterFunc = lambda (u,v): isProp(v) and (v.parent_relation == "acomp") and len(self.gr.neighbors(v))==1 and u.isPredicate)
    for pred, prop in edges:
        acompNode = self.gr.neighbors(prop)[0]
        duplicateEdge(graph=self.gr, orig=(pred,prop), new=(pred,acompNode), newLabel = "modifier")
        self.gr.del_node(prop) # TODO: could there be others connected to it?
        newPred = node.join(pred,acompNode,self.gr)
        newPred.isPredicate =True
        self.gr.add_node(newPred)
        for neigbour in self.gr.neighbors(pred):
            duplicateEdge(graph=self.gr, orig=(pred,neigbour), new=(newPred,neigbour))
        for curFather in self.gr.incidents(pred):
            duplicateEdge(graph=self.gr, orig=(curFather,pred), new=(curFather,newPred))
        if len(self.gr.neighbors(acompNode))==0:
            self.gr.del_node(acompNode)
        self.gr.del_node(pred)
        # newPred.features["debug"] =True #TODO: remove this
        self.types.add("ACOMP")
        return True
    #16
    # Prop / rcmod-prop node pointing back at its own father, with a single
    # neighbor, is deleted.
    edges = find_edges(graph=self.gr, filterFunc = lambda (u,v): (isProp(v) or isRcmodProp(v)) and (u in self.gr.neighbors(v)))
    for _,v in edges:
        if (len(self.gr.neighbors(v))==1):
            self.gr.del_node(v)
            return True
    #17
    # Childless SOURCE_LABEL targets whose text is a contraction are deleted.
    edges = find_edges(graph=self.gr, filterFunc = lambda (u,v): self.gr.edge_label((u,v))==SOURCE_LABEL and (len(self.gr.neighbors(v))==0))
    for _,v in edges:
        curStr = " ".join([w.word for w in v.text])
        if curStr in contractions:
            self.gr.del_node(v)
            return True
    #18 - verbal complements
    # 'ccomp' edges leaving a predicate are relabelled 'dobj'.
    edges = find_edges(graph=self.gr, filterFunc = lambda (u,v): self.gr.edge_label((u,v))=='ccomp' and u.isPredicate)
    for u,v in edges:
        self.gr.del_edge((u,v))
        self.gr.add_edge(edge=(u,v), label = 'dobj')
        v.features["debug"] =True
        self.types.add("DEBUG")
        return True
    return change
def _fix(self):
    """
    Run the graph clean-up passes and report whether an early pass fired.

    Passes, in order:
      1. Delete the first "mark" child that has no neighbours and whose
         text is the single word "that", then return True.
      2. For the first "rcmod" edge (u, v) that has no reverse edge
         (v, u), add (v, u) labelled ARG_LABEL, then return True.
      3. Collapse "prep" edges (u, v) where v has exactly one neighbour
         and a "pobj" entry: unless u -> pobj already exists, append v's
         first word to u.surface_form, add u -> pobj labelled
         "prep_<word>" and delete v.
      4. For "pobj" edges (u, v) whose source has no incoming "prep",
         visited in order of u.minIndex(): if u has neighbours under a
         "prepc_*" relation that themselves have no neighbours, relabel
         (u, v) with the label of the lowest-minIndex one and delete it.
      5. Relabel every "agent" edge as "prep_by".

    Returns True immediately after pass 1 or pass 2 changes the graph,
    and False once passes 3-5 have run.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source -- confirm against version control.
    """

    def label_of(edge):
        return self.edge_label((edge[0], edge[1]))

    # remove mark->that
    for _, marked in find_edges(self, lambda e: label_of(e) == "mark"):
        if len(self.neighbors(marked)) != 0:
            continue
        if len(marked.text) != 1:
            continue
        if marked.text[0].word != "that":
            continue
        self.del_node(marked)
        return True

    # rcmod with no relation to father
    def is_one_way_rcmod(e):
        return (label_of(e) == "rcmod") and (not self.has_edge((e[1], e[0])))

    for father, clause in find_edges(self, is_one_way_rcmod):
        self.add_edge((clause, father), label=ARG_LABEL)
        return True

    # prep collapse
    def is_collapsible_prep(e):
        return ((label_of(e) == "prep")
                and (len(self.neighbors(e[1])) == 1)
                and ("pobj" in e[1].neighbors()))

    for governor, prep in find_edges(self, is_collapsible_prep):
        pobj = prep.neighbors()["pobj"][0]
        if self.has_edge((governor, pobj)):
            continue
        prep_word = prep.text[0]
        governor.surface_form += [prep_word]
        self.add_edge((governor, pobj), label="prep_" + prep_word.word)
        # NOTE(review): the prep node is only dropped when the collapsed
        # edge was added; the original indentation is ambiguous here.
        self.del_node(prep)

    # fix dependency collapse bugs
    def is_stray_pobj(e):
        return (label_of(e) == "pobj") and ("prep" not in e[0].incidents())

    stray_edges = find_edges(self, is_stray_pobj)
    for src, dst in sorted(stray_edges, key=lambda e: e[0].minIndex()):
        by_relation = src.neighbors()
        prepc_relations = [
            rel for rel in by_relation if rel.startswith("prepc_")
        ]
        leaves = sorted(
            [
                n for n in multi_get(by_relation, prepc_relations)
                if len(self.neighbors(n)) == 0
            ],
            key=lambda n: n.minIndex())
        if not leaves:
            continue
        absorbed = leaves[0]
        new_label = self.edge_label((src, absorbed))
        self.del_edge((src, dst))
        self.add_edge((src, dst), label=new_label)
        self.del_node(absorbed)

    # change agent edges with "prep_by"
    for agent_edge in find_edges(
            self, lambda edge: (self.edge_label(edge) == "agent")):
        self.del_edge(agent_edge)
        self.add_edge(agent_edge, label="prep_by")

    # Disabled pass, kept for reference:
    # #add xcomp inverse node
    # edges = find_edges(self, lambda (u,v):self.edge_label((u,v)) == "xcomp" and u.isPredicate and v.isPredicate)
    # for (u,v) in edges:
    #     if not self.has_edge((v, u)):
    #         self.add_edge((v,u), label=SOURCE_LABEL)
    #         self.types.add("infinitives")
    #         return True
    #     if not multi_get(v.neighbors(),subject_dependencies):
    #         rcmodParentIncidents = u.incidents().get("rcmod",[])
    #         if len(rcmodParentIncidents)==1:
    #             subj = rcmodParentIncidents[0]
    #             if not self.has_edge((v,subj)):
    #                 self.add_edge((v,subj),label=ARG_LABEL)
    return False
# fix dependency collapse bugs edges = find_edges(self, lambda (u, v):(self.edge_label((u, v)) == "pobj") and ("prep" not in u.incidents())) for (u, v) in sorted(edges,key=lambda((u,v)): u.minIndex()): neighbors = u.neighbors() candidates = [n for n in multi_get(neighbors, [rel for rel in neighbors if rel.startswith("prepc_")]) if len(self.neighbors(n)) == 0] candidates.sort(key=lambda n:n.minIndex()) if len(candidates) > 0: curToDel = candidates[0] rel = self.edge_label((u, curToDel)) self.del_edge((u, v)) self.add_edge((u, v), label=rel) self.del_node(curToDel) # change agent edges with "prep_by" edges = find_edges(self, lambda edge:(self.edge_label(edge) == "agent")) for edge in edges: self.del_edge(edge) self.add_edge(edge,label="prep_by") # #add xcomp inverse node # edges = find_edges(self, lambda (u,v):self.edge_label((u,v)) == "xcomp" and u.isPredicate and v.isPredicate) # for (u,v) in edges: # if not self.has_edge((v, u)): # self.add_edge((v,u), label=SOURCE_LABEL) # self.types.add("infinitives") # return True # if not multi_get(v.neighbors(),subject_dependencies): # rcmodParentIncidents = u.incidents().get("rcmod",[]) # if len(rcmodParentIncidents)==1: # subj = rcmodParentIncidents[0]
def remove_aux(self):
    """
    Fold the targets of ignorable edges into their source nodes.

    For every edge (u, v) whose label is in ``ignore_labels``: if v is
    still registered in ``self.nodesMap``, v's ``original_text`` is
    appended to u's ``original_text`` and v is deleted from the graph.
    Targets already removed by an earlier edge are skipped.
    """

    def has_ignored_label(edge):
        return self.edge_label(edge) in ignore_labels

    for keeper, dropped in find_edges(self, has_ignored_label):
        if dropped.uid not in self.nodesMap:
            # already deleted while handling a previous edge
            continue
        keeper.original_text.extend(dropped.original_text)
        self.del_node(dropped)
def do_prop(self):
    """
    Rewrite adjectival, copular and appositive constructions in place.

    Three passes, in order:
      1. "amod" edges (domain, mod) whose domain POS is in
         ``determined_labels``: call createPropRel, flag mod as "top"
         and remove the amod edge.
      2. Predicate nodes whose text is a single word from
         ``copular_verbs`` and that have at least one subject and one
         object (clausal complement, falling back to "dep"): each
         subject is related to the first object either through
         createPropRel (indefinite object, or "acomp") or through a new
         copular node from getCopular ("SameAs"); the copula's incoming
         edges and remaining children are duplicated onto the new head,
         and the copula node is deleted.
      3. "appos" edges (subj, obj): every father of subj also gets an
         edge to obj with the same label, then subj/obj are related via
         createPropRel or a new copular node, and the appos edge is
         removed.

    The graph is mutated in place; nothing is returned.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source -- confirm against version control.
    """
    # prenominal of definite
    # The filter receives the edge as one argument and indexes it:
    # tuple-parameter lambdas ("lambda (u, v): ...") are a SyntaxError on
    # Python 3, and the rest of the file already uses the indexed form.
    edges = find_edges(
        self, lambda edge: self.edge_label((edge[0], edge[1])) == "amod")
    for domain, mod in edges:
        if domain.pos() in determined_labels:
            # the np by itself is definite
            self.createPropRel(domain=domain, mod=mod)
            mod.features["top"] = True
            self.del_edge((domain, mod))

    # copular on adjective or indefinite
    # and sameAs otherwise
    # find copular
    nodes = find_nodes(
        self, lambda n: len(n.text) == 1 and n.text[0].word in copular_verbs
        and n.isPredicate)
    for curNode in nodes:
        curNeighbours = curNode.neighbors()
        subjs = multi_get(curNeighbours, subject_dependencies)
        objs = multi_get(curNeighbours, clausal_complements)
        if not objs:
            objs = multi_get(curNeighbours, ["dep"])
        # Build the exclusion list once instead of once per neighbour.
        subjsAndObjs = subjs + objs
        others = [n for n in self.neighbors(curNode) if n not in subjsAndObjs]
        if (len(objs) > 0) and (len(subjs) > 0):
            # and (not others) and (len(objs) == 1):
            # only the first object is treated as the complement; the
            # rest are handled like any other child of the copula
            others += objs[1:]
            if others:
                self.types.add("complicated BE")
            obj = objs[0]
            if len(objs) > 1:
                self.types.add("debug")
            # The copula's lemma is dropped before its features are
            # copied; subjs is non-empty here, so doing it once up front
            # matches doing it on every loop iteration.
            curNode.features.pop('Lemma', None)
            for subj in subjs:
                if subj in self.neighbors(obj):
                    obj.features.update(curNode.features)
                else:
                    if (not isDefinite(obj)) or (
                            obj in curNeighbours.get("acomp", [])):
                        self.createPropRel(domain=subj, mod=obj)
                        head = obj
                        obj.surface_form += curNode.surface_form
                    else:
                        self.types.add("SameAs")
                        self.del_edge((curNode, subj))
                        if self.has_edge((curNode, obj)):
                            self.del_edge((curNode, obj))
                        # self.del_edges([(curNode, subj), (curNode, obj)])
                        copularNode = getCopular(self,
                                                 curNode.text[0].index,
                                                 features=curNode.features)
                        copularNode.surface_form = curNode.surface_form
                        self.add_edge((copularNode, subj),
                                      label=FIRST_ENTITY_LABEL)
                        self.add_edge((copularNode, obj),
                                      label=SECOND_ENTITY_LABEL)
                        head = copularNode
                    head.features.update(curNode.features)
                    # re-attach the copula's fathers and leftover
                    # children to the node that now heads the relation
                    for curFather in self.incidents(curNode):
                        if not self.has_edge((curFather, head)):
                            duplicateEdge(graph=self,
                                          orig=(curFather, curNode),
                                          new=(curFather, head))
                    for curOther in others:
                        if not self.has_edge((obj, curOther)):
                            duplicateEdge(graph=self,
                                          orig=(curNode, curOther),
                                          new=(head, curOther))
            # erase "be" node
            self.del_node(curNode)

    # find appositions
    for subj, obj in find_edges(
            self, lambda edge: self.edge_label(edge) == "appos"):
        # duplicate relations
        for curFather in self.incidents(subj):
            curIndex = curFather.features.get("apposIndex", 0) + 1
            # curLabel = "{0},{1}".format(curIndex,self.edge_label((curFather,subj)))
            curLabel = self.edge_label((curFather, subj))
            self.del_edge((curFather, subj))
            self.add_edge((curFather, subj), curLabel)
            self.add_edge((curFather, obj), curLabel)
            curFather.features.setdefault("dups", []).append((subj, obj))
            curFather.features["apposIndex"] = curIndex
        if (not isDefinite(subj) and not isDefinite(obj)) or (
                obj in subj.neighbors().get("acomp", [])):
            self.createPropRel(domain=subj, mod=obj)
            obj.features["top"] = True
        else:
            # add new node
            # TODO: subj here is a problem - should point to the comma or something
            self.types.add("SameAs")
            copularNode = getCopular(self, subj.text[0].index, features={})
            copularNode.surface_form = []
            self.add_edge((copularNode, subj), label=FIRST_ENTITY_LABEL)
            self.add_edge((copularNode, obj), label=SECOND_ENTITY_LABEL)
        self.del_edge((subj, obj))
def do_conj(self):
    """
    Re-head every "conj_*" coordination under a new marker node.

    For each node conj1 that is the source of a "conj_*" edge, and for
    each such relation, a predicate node carrying the conjunction word
    (the part of the label after "conj_") is created. The marker is
    wired to conj1's fathers, the conj1 -> conj2 edges are replaced by
    marker -> conj1 / marker -> conj2 edges with the same label, and
    the relation name is recorded in ``self.types``.

    The graph is mutated in place; nothing is returned.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source -- confirm against version control.
    """
    edges = find_edges(self, lambda u_v10: self.edge_label(
        (u_v10[0], u_v10[1])).startswith(
            "conj_"))  # and (not u.isPredicate) and (not v.isPredicate))
    # distinct sources of conjunction edges
    nodes = set([u for (u, _) in edges])
    for conj1 in nodes:
        curStartIndex = conj1.minIndex() + 1
        curNeighbours = conj1.neighbors()
        # True when conj1 has at least one incoming edge and every
        # incoming edge is an aux edge according to is_aux_edge
        isModifier = (not bool([
            father for father in self.incidents(conj1)
            if not self.is_aux_edge((father.uid, conj1.uid))
        ])) and bool(self.incidents(conj1))
        for rel in [
                rel for rel in curNeighbours if rel.startswith("conj_")
        ]:
            # e.g. "conj_and" -> "and"
            marker = rel.split("conj_")[1]
            markerNode = newNode.Node(
                text=[Word(curStartIndex + 1,
                           marker)],  #TODO: how to find marker's index
                isPredicate=True,
                features={"conj": True},
                gr=self)
            #decide how to connect it to the rest of the graph, based on its type
            if isModifier:
                duplicate_all_incidents(gr=self,
                                        source=conj1,
                                        target=markerNode)
            else:
                # each father of conj1 is also linked to every conjunct
                # and to the marker, reusing the father -> conj1 edge
                for father in self.incidents(conj1):
                    for conj2 in curNeighbours[rel]:
                        duplicateEdge(graph=self,
                                      orig=((father, conj1)),
                                      new=((father, conj2)))
                    duplicateEdge(graph=self,
                                  orig=((father, conj1)),
                                  new=((father, markerNode)))
            if conj1.isPredicate:
                # share conj1's children whose span starts before
                # curStartIndex with the other conjuncts (presumably the
                # arguments preceding conj1 -- TODO confirm what
                # get_min_max_span returns)
                for neighbor in self.neighbors(conj1):
                    if get_min_max_span(self, neighbor)[0] < curStartIndex:
                        for conj2 in curNeighbours[rel]:
                            if (self.edge_label(
                                (conj1, neighbor)) == SOURCE_LABEL
                                ) or (not self.is_aux_edge(
                                    (conj1.uid, neighbor.uid))):
                                duplicateEdge(graph=self,
                                              orig=(conj1, neighbor),
                                              new=(conj2, neighbor))
            # create the coordination construction, headed by the marker
            self.add_edge(edge=(markerNode, conj1), label=rel)
            for conj2 in curNeighbours[rel]:
                self.del_edge((conj1, conj2))
                self.add_edge(edge=(markerNode, conj2), label=rel)
                if conj1.isPredicate:
                    conj2.isPredicate = conj1.isPredicate
                # drop conj2's words from conj1's surface form, then put
                # conj1's own text words back at the end
                conj1.surface_form = [
                    w for w in conj1.surface_form
                    if (w not in conj2.surface_form) and (
                        w not in conj1.text)
                ]
                for w in conj1.text:
                    if w not in conj1.surface_form:
                        conj1.surface_form.append(w)
            # remove from conj1's text the words whose index is listed in
            # the second element of the "conjType" feature
            if conj1.features.get("conjType", False):
                conj1.text = [
                    w for w in conj1.text
                    if w.index not in conj1.features["conjType"][1]
                ]
            self.types.add(rel)
def do_prop(self):
    """
    Rewrite adjectival, copular and appositive constructions in place.

    Three passes, in order:
      1. "amod" edges (domain, mod) whose domain POS is in
         ``determined_labels``: call createPropRel, flag mod as "top"
         and remove the amod edge.
      2. Predicate nodes whose text is a single word from
         ``copular_verbs`` and that have at least one subject and one
         object (clausal complement, falling back to "dep"): each
         subject is related to the first object either through
         createPropRel (indefinite object, or "acomp") or through a new
         copular node from getCopular ("SameAs"); the copula's incoming
         edges and remaining children are duplicated onto the new head,
         and the copula node is deleted.
      3. "appos" edges (subj, obj): every father of subj also gets an
         edge to obj with the same label, then subj/obj are related via
         createPropRel or a new copular node, and the appos edge is
         removed.

    The graph is mutated in place; nothing is returned.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source -- confirm against version control.
    """
    # prenominal of definite
    # The filter receives the edge as one argument and indexes it:
    # tuple-parameter lambdas ("lambda (u, v): ...") are a SyntaxError on
    # Python 3, and the rest of the file already uses the indexed form.
    edges = find_edges(
        self, lambda edge: self.edge_label((edge[0], edge[1])) == "amod")
    for domain, mod in edges:
        if domain.pos() in determined_labels:
            # the np by itself is definite
            self.createPropRel(domain=domain, mod=mod)
            mod.features["top"] = True
            self.del_edge((domain, mod))

    # copular on adjective or indefinite
    # and sameAs otherwise
    # find copular
    nodes = find_nodes(
        self, lambda n: len(n.text) == 1 and n.text[0].word in copular_verbs
        and n.isPredicate)
    for curNode in nodes:
        curNeighbours = curNode.neighbors()
        subjs = multi_get(curNeighbours, subject_dependencies)
        objs = multi_get(curNeighbours, clausal_complements)
        if not objs:
            objs = multi_get(curNeighbours, ["dep"])
        # Build the exclusion list once instead of once per neighbour.
        subjsAndObjs = subjs + objs
        others = [n for n in self.neighbors(curNode) if n not in subjsAndObjs]
        if (len(objs) > 0) and (len(subjs) > 0):
            # and (not others) and (len(objs) == 1):
            # only the first object is treated as the complement; the
            # rest are handled like any other child of the copula
            others += objs[1:]
            if others:
                self.types.add("complicated BE")
            obj = objs[0]
            if len(objs) > 1:
                self.types.add("debug")
            # The copula's lemma is dropped before its features are
            # copied; subjs is non-empty here, so doing it once up front
            # matches doing it on every loop iteration.
            curNode.features.pop('Lemma', None)
            for subj in subjs:
                if subj in self.neighbors(obj):
                    obj.features.update(curNode.features)
                else:
                    if (not isDefinite(obj)) or (
                            obj in curNeighbours.get("acomp", [])):
                        self.createPropRel(domain=subj, mod=obj)
                        head = obj
                        obj.surface_form += curNode.surface_form
                    else:
                        self.types.add("SameAs")
                        self.del_edge((curNode, subj))
                        if self.has_edge((curNode, obj)):
                            self.del_edge((curNode, obj))
                        # self.del_edges([(curNode, subj), (curNode, obj)])
                        copularNode = getCopular(self,
                                                 curNode.text[0].index,
                                                 features=curNode.features)
                        copularNode.surface_form = curNode.surface_form
                        self.add_edge((copularNode, subj),
                                      label=FIRST_ENTITY_LABEL)
                        self.add_edge((copularNode, obj),
                                      label=SECOND_ENTITY_LABEL)
                        head = copularNode
                    head.features.update(curNode.features)
                    # re-attach the copula's fathers and leftover
                    # children to the node that now heads the relation
                    for curFather in self.incidents(curNode):
                        if not self.has_edge((curFather, head)):
                            duplicateEdge(graph=self,
                                          orig=(curFather, curNode),
                                          new=(curFather, head))
                    for curOther in others:
                        if not self.has_edge((obj, curOther)):
                            duplicateEdge(graph=self,
                                          orig=(curNode, curOther),
                                          new=(head, curOther))
            # erase "be" node
            self.del_node(curNode)

    # find appositions
    for subj, obj in find_edges(
            self, lambda edge: self.edge_label(edge) == "appos"):
        # duplicate relations
        for curFather in self.incidents(subj):
            curIndex = curFather.features.get("apposIndex", 0) + 1
            # curLabel = "{0},{1}".format(curIndex,self.edge_label((curFather,subj)))
            curLabel = self.edge_label((curFather, subj))
            self.del_edge((curFather, subj))
            self.add_edge((curFather, subj), curLabel)
            self.add_edge((curFather, obj), curLabel)
            curFather.features.setdefault("dups", []).append((subj, obj))
            curFather.features["apposIndex"] = curIndex
        if (not isDefinite(subj) and not isDefinite(obj)) or (
                obj in subj.neighbors().get("acomp", [])):
            self.createPropRel(domain=subj, mod=obj)
            obj.features["top"] = True
        else:
            # add new node
            # TODO: subj here is a problem - should point to the comma or something
            self.types.add("SameAs")
            copularNode = getCopular(self, subj.text[0].index, features={})
            copularNode.surface_form = []
            self.add_edge((copularNode, subj), label=FIRST_ENTITY_LABEL)
            self.add_edge((copularNode, obj), label=SECOND_ENTITY_LABEL)
        self.del_edge((subj, obj))