def findiobjextract(patt, node, root):
    """Find the indirect-object phrase adjacent to the phrase holding ``node``.

    The siblings of the parent phrase (``treeops.getparentphrase``) are
    walked to the right first; if that yields no head, they are walked to
    the left.  Phrases whose label is considered transparent are stepped
    over, the first label containing ``"iobj"`` is handed to
    ``treeops.getextractphrase``, and any other label ends that walk.

    Returns ``(whole, head)`` when a head was found and its case and the
    pattern case (``helpers.getpatterncase``) are substrings of one another
    in either direction; otherwise ``("", "")``.
    """
    # Labels that merely have to *contain* one of these are stepped over.
    transparent_markers = ("advp", "qual", "timex", "pp", "vp")
    # Labels that are a *substring of* one of these are stepped over
    # (this mirrors the `label in "np"` style of test used in this module).
    transparent_labels = ("obj", "np", "nps", "ap", "aps")

    def siblings(start, step):
        # Yield successive siblings of `start` via `getnext`/`getprevious`.
        neighbour = getattr(start, step)()
        while neighbour is not None:
            yield neighbour
            neighbour = getattr(neighbour, step)()

    anchor = treeops.getparentphrase(node, root)
    whole, head = "", ""

    for step in ("getnext", "getprevious"):
        if head:
            # The rightward walk already produced a head; skip the leftward one.
            break
        for phrase in siblings(anchor, step):
            label = treeops.text(phrase)
            if (any(marker in label for marker in transparent_markers)
                    or any(label in name for name in transparent_labels)):
                continue
            if "iobj" in label:
                whole, head, excase = treeops.getextractphrase(phrase, root)
            # Either the indirect object was extracted or a blocking phrase
            # was hit; in both cases this direction is finished.
            break

    if not head:
        return "", ""

    pattcase = helpers.getpatterncase(patt)
    if pattcase in excase or excase in pattcase:
        return whole, head
    return "", ""
def findsubjextract(patt, node, root):
    """Find the subject phrase belonging to the phrase that holds ``node``.

    Three scans over the siblings of the parent phrase are tried in order,
    each only if the previous one produced no head:

    1. leftwards, looking for a label containing ``"subj"``;
    2. rightwards, looking for a label containing ``"subj"``;
    3. leftwards again, looking for an AP complement (label that is a
       substring of ``"ap-comp"`` / ``"aps-comp"``) acting as subject.

    In the leftward scans a relative-clause marker (label containing
    ``"scp"``) triggers a look at the phrase *before* the marker, e.g.
    ``=Maðurinn= sem *beit* hestinn``.

    Fix relative to the previous revision: the look-behind fetched the
    phrase before the ``scp`` node but then inspected and extracted from
    the ``scp`` node itself; it now uses the fetched phrase and stops
    cleanly when there is none.

    Returns ``(whole, head)`` when a head was found and its case and the
    pattern case (``helpers.getpatterncase``) are substrings of one another
    in either direction; otherwise ``("", "")``.
    """
    lookbehind_markers = ("advp", "pp", "qual", "timex")
    scan_markers = lookbehind_markers + ("vp",)

    def has_marker(label, markers):
        return any(marker in label for marker in markers)

    def is_fragment_of(label, names):
        # Mirrors the module's `label in "np"` tests: true when the label
        # is a substring of one of the names (not the other way round).
        return any(label in name for name in names)

    def scan_left(bare_labels, is_target):
        # Walk towards the start of the clause; return the extraction
        # triple for the first target phrase, or three empty strings.
        current = parent
        while current.getprevious() is not None:
            current = current.getprevious()
            text = treeops.text(current)
            if has_marker(text, scan_markers) or is_fragment_of(text, bare_labels):
                continue
            if is_target(text):
                return treeops.getextractphrase(current, root)
            if "scp" not in text:
                break
            # Relative clause: the candidate sits before the "scp" node.
            newcurrent = current.getprevious()
            if newcurrent is None:
                break
            ntext = treeops.text(newcurrent)
            if (has_marker(ntext, lookbehind_markers)
                    or is_fragment_of(ntext, bare_labels)):
                continue
            if is_target(ntext):
                return treeops.getextractphrase(newcurrent, root)
            # Otherwise fall through: the next iteration classifies
            # `newcurrent` with the regular rules above.
        return "", "", ""

    def scan_right():
        # Walk towards the end of the clause looking for a "subj" label.
        current = parent
        while current.getnext() is not None:
            current = current.getnext()
            text = treeops.text(current)
            if (has_marker(text, scan_markers)
                    or is_fragment_of(text, ("np", "nps", "ap", "aps"))):
                continue
            if "subj" in text:
                return treeops.getextractphrase(current, root)
            break
        return "", "", ""

    parent = treeops.getparentphrase(node, root)

    whole, head, excase = scan_left(
        ("np", "nps", "ap", "aps"), lambda label: "subj" in label)

    if not head:
        whole, head, excase = scan_right()

    if not head:
        # Looking for AP-COMP acting as subj
        whole, head, excase = scan_left(
            ("np", "nps"),
            lambda label: label in "ap-comp" or label in "aps-comp")
        if head:
            # Debug trace kept from the original; the single-argument call
            # form prints the same text under Python 2 and Python 3.
            print("yay, ap-comp acting as np-subj! {}".format(whole))

    if not head:
        return "", ""

    pattcase = helpers.getpatterncase(patt)
    if pattcase not in excase and excase not in pattcase:
        return "", ""
    return whole, head
def getyestertrigger(node, root):
    """Find the trigger word for a phrase whose preposition is fixed to "í".

    The siblings of the parent phrase (``treeops.getparentphrase``) are
    searched leftwards and, if that gives no trigger, rightwards.  A noun
    phrase yields a ``"noun pp|í"`` trigger; a verb phrase yields
    ``"passive pp|í"`` or ``"active pp|í"``.  When the verb phrase is
    neither ``vpp-comp`` nor ``vpi``/``vps``/``vpg``, the search keeps
    going in the same direction for a more specific verb phrase and falls
    back to the first verb if none is found.

    Returns ``(trigger, triggertype)`` or ``("", "")``.
    """
    prep = "í"
    noun_kind = "noun pp|" + prep
    passive_kind = "passive pp|" + prep
    active_kind = "active pp|" + prep

    bare_labels = ("np", "nps", "ap", "aps")

    def is_bare(label):
        # Mirrors `label in "np"`: the label is a substring of the name.
        return any(label in name for name in bare_labels)

    def specific_verb_kind(label):
        # Trigger type for an unambiguous verb phrase label, else None.
        if "vpp-comp" in label:
            return passive_kind
        if "vpi" in label or "vps" in label or "vpg" in label:
            return active_kind
        return None

    def siblings(start, step):
        neighbour = getattr(start, step)()
        while neighbour is not None:
            yield neighbour
            neighbour = getattr(neighbour, step)()

    def search(step):
        for phrase in siblings(anchor, step):
            label = treeops.text(phrase)
            if "np-qual" in label or "timex" in label or is_bare(label):
                continue
            if "np" in label:
                return treeops.gettriggernoun(phrase), noun_kind
            if "vp" in label:
                kind = specific_verb_kind(label)
                if kind is not None:
                    return treeops.gettriggerverb(phrase), kind
                # This verb might be the only one; keep looking for a more
                # specific verb phrase further along.
                found, kind = treeops.gettriggerverb(phrase), active_kind
                for other in siblings(phrase, step):
                    other_label = treeops.text(other)
                    if ("advp" in other_label or "np-qual" in other_label
                            or "timex" in other_label or is_bare(other_label)):
                        continue
                    other_kind = specific_verb_kind(other_label)
                    if other_kind is not None:
                        found, kind = treeops.gettriggerverb(other), other_kind
                        break
                    if "pp" not in other_label:
                        break  # this verb is the only one we have
                return found, kind
            if "pp" not in label:
                break  # nothing found here
        return "", ""

    anchor = treeops.getparentphrase(node, root)

    trigger, triggertype = search("getprevious")
    if not trigger:
        trigger, triggertype = search("getnext")
    if not trigger:
        return "", ""
    return trigger, triggertype
def findppextract(patt, node, root):
    """Find the prepositional-phrase extract related to ``node``.

    The siblings after the parent phrase are scanned first; if no head was
    found, the siblings before it are scanned.  Words whose tag starts with
    "a" are collected as the preposition (``prep``); the lemmas of the other
    words are collected in ``parts`` and the first noun becomes ``head``.
    A candidate is accepted only if its case is compatible with
    ``helpers.getpatterncase(patt)`` and its preposition text and
    ``helpers.getpatternprep(patt)`` are each contained in the other.

    Returns ``(whole, head)`` on success, otherwise ``("", "")``.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; the attachment of the ``continue``/``else``
    statements marked below should be confirmed against the original file.
    """
    whole, head, case = "", "", ""
    parts, prep = [], []
    parent = treeops.getparentphrase(node, root)
    current = parent
    # --- Scan the phrases following the parent phrase ---
    while current.getnext() is not None:
        current = current.getnext()
        text = treeops.text(current)
        if treeops.yester(current):  # Deals with "í gær" being categorized as ADVP, not PP
            pp = "í"
            whole = "gær"
            head = "gær"
            case = "nom"
            pattcase = helpers.getpatterncase(patt)
            if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                head, case = "", ""
                continue
            pattprep = helpers.getpatternprep(patt)
            # Rejects unless each string contains the other.
            if pp not in pattprep or pattprep not in pp:
                pp, head, case = "", "", ""
                continue
            break
        elif "advp" in text:
            # Collect the "a"-tagged words of the adverbial phrase as the preposition.
            for word in current.findall(".//WORD"):
                mytag = treeops.tag(word)
                if "a" in mytag[0]:
                    prep.append(treeops.text(word))
                    continue
            # NOTE(review): assumed to run after the loop above, not inside it — confirm.
            nowcurrent = current
            while nowcurrent.getnext() is not None:
                nowcurrent = nowcurrent.getnext()
                nowtext = treeops.text(nowcurrent)
                nowtext = nowtext.replace("[", "")
                if nowtext in "np" or nowtext in "nps" or "timex" in nowtext:  #Want to treat as a pp phrase
                    for word in nowcurrent.findall(".//WORD"):
                        parts.append(treeops.lemma(word))
                        if not head:
                            if treeops.isnoun(word):
                                head = treeops.lemma(word)
                                case = treeops.headcase(treeops.tag(word))
                    continue
                else:
                    break
            if not parts or not prep:  #Shouldn't be treated as pp phrase
                head, case, parts, prep = "", "", [], []
                continue
            pattcase = helpers.getpatterncase(patt)
            if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                head, case, parts, prep = "", "", [], []
                continue
            pp = " ".join(prep)
            pattprep = helpers.getpatternprep(patt)
            if pp not in pattprep or pattprep not in pp:
                pp, head, case, parts, prep = "", "", "", [], []
                continue
            whole = " ".join(parts)
            break
        elif "qual" in text or text in "np" or text in "nps" or text in "ap" or text in "aps" or "timex" in text:
            # `text in "np"` is true when the label is a substring of "np".
            continue
        elif "pp" in text:
            for word in current.findall(".//WORD"):
                mytag = treeops.tag(word)
                if "a" in mytag[0]:
                    prep.append(treeops.text(word))
                    continue
                else:
                    parts.append(treeops.lemma(word))
                    if not head:
                        if treeops.isnoun(word):
                            head = treeops.lemma(word)
                            case = treeops.headcase(treeops.tag(word))
            if not parts or not prep:  #Weird pp phrase, continue search
                head, case, parts, prep = "", "", [], []
                continue
            pattcase = helpers.getpatterncase(patt)
            if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                head, case, parts, prep = "", "", [], []
                continue
            pp = " ".join(prep)
            pattprep = helpers.getpatternprep(patt)
            if pp not in pattprep or pattprep not in pp:  #wrong preposition phrase
                pp, head, case, parts, prep = "", "", "", [], []
                continue
            if current.getnext() is not None and not head:  # If I haven't found the head yet
                # Look for the head in the phrase following the PP.
                mynext = current.getnext()
                mytext = treeops.text(mynext)
                if "np" in mytext:
                    for word in mynext.findall(".//WORD"):
                        parts.append(treeops.lemma(word))
                        if treeops.isnoun(word):
                            head = treeops.lemma(word)
                            case = treeops.headcase(treeops.tag(word))
                    pp = " ".join(prep)
                    whole = " ".join(parts)
                    break
                else:
                    break
            if not head:
                return "", ""
            pp = " ".join(prep)
            whole = " ".join(parts)
            break
        else:
            # Any other phrase type: check whether the parent phrase is
            # preceded by an "scp" node that is itself preceded by a PP.
            current = parent
            if current.getprevious() is not None:
                current = current.getprevious()
                text = treeops.text(current)
                if "scp" in text:
                    if current.getprevious() is not None:
                        current = current.getprevious()
                        text = treeops.text(current)
                        if "pp" in text:
                            for word in current.findall(".//WORD"):
                                mytag = treeops.tag(word)
                                if "a" in mytag[0]:
                                    prep.append(treeops.text(word))
                                    continue
                                else:
                                    parts.append(treeops.lemma(word))
                                    if not head:
                                        if treeops.isnoun(word):
                                            head = treeops.lemma(word)
                                            case = treeops.headcase(
                                                treeops.tag(word))
                            if not parts or not prep:
                                return "", ""
                            pattcase = helpers.getpatterncase(patt)
                            if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                                return "", ""
                            pp = " ".join(prep)
                            pattprep = helpers.getpatternprep(patt)
                            if pp not in pattprep or pattprep not in pp:
                                return "", ""
                            if current.getnext() is not None and not head:  # If I haven't found the head yet
                                mynext = current.getnext()
                                mytext = treeops.text(mynext)
                                # NOTE(review): tests `text` and iterates `current`
                                # although `mytext`/`mynext` were just computed
                                # (the forward "pp" branch above uses those);
                                # looks like a copy-paste slip — confirm.
                                if "np" in text:
                                    for word in current.findall(".//WORD"):
                                        parts.append(treeops.lemma(word))
                                        if treeops.isnoun(word):
                                            head = treeops.lemma(word)
                                            case = treeops.headcase(
                                                treeops.tag(word))
                                    # NOTE(review): leaves the loop before
                                    # `whole` is joined below — confirm intent.
                                    break
                                else:
                                    break
                            whole = " ".join(parts)
                            break
                        else:
                            break
                    else:
                        break
                else:
                    break
            else:
                break
    # --- Nothing found after the parent phrase: scan the phrases before it ---
    if not head:
        current = parent
        while current.getprevious() is not None:
            current = current.getprevious()
            text = treeops.text(current)
            text = text.replace("[", "")
            if treeops.yester(current):
                pp = "í"
                whole = "gær"
                head = "gær"
                case = "nom"
                pattcase = helpers.getpatterncase(patt)
                if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                    head, case = "", ""
                    continue
                pattprep = helpers.getpatternprep(patt)
                if pp not in pattprep or pattprep not in pp:
                    pp, head, case = "", "", ""
                    continue
                break
            elif "advp" in text:
                for word in current.findall(".//WORD"):
                    mytag = treeops.tag(word)
                    if "a" in mytag[0]:
                        prep.append(treeops.text(word))
                        continue
                # The noun phrases are still looked up *after* the adverbial
                # phrase (getnext), even though the outer scan runs backwards.
                nowcurrent = current
                while nowcurrent.getnext() is not None:
                    nowcurrent = nowcurrent.getnext()
                    nowtext = treeops.text(nowcurrent)
                    nowtext = nowtext.replace("[", "")
                    if nowtext in "np" or nowtext in "nps" or "timex" in nowtext:  #Want to treat as a pp phrase
                        for word in nowcurrent.findall(".//WORD"):
                            parts.append(treeops.lemma(word))
                            if not head:
                                if treeops.isnoun(word):
                                    head = treeops.lemma(word)
                                    case = treeops.headcase(treeops.tag(word))
                        continue
                    else:
                        break
                if not parts or not prep:  #Shouldn't be treated as pp phrase
                    head, case, parts, prep = "", "", [], []
                    continue
                pattcase = helpers.getpatterncase(patt)
                if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                    head, case, parts, prep = "", "", [], []
                    continue
                pp = " ".join(prep)
                pattprep = helpers.getpatternprep(patt)
                if pp not in pattprep or pattprep not in pp:
                    pp, head, case, parts, prep = "", "", "", [], []
                    continue
                whole = " ".join(parts)
                break
            elif "qual" in text or text in "np" or text in "nps" or text in "ap" or text in "aps" or "timex" in text:
                continue
            elif "pp" in text:
                for word in current.findall(".//WORD"):
                    mytag = treeops.tag(word)
                    if "a" in mytag[0]:
                        prep.append(treeops.text(word))
                        continue
                    else:
                        parts.append(treeops.lemma(word))
                        if not head:
                            if treeops.isnoun(word):
                                head = treeops.lemma(word)
                                case = treeops.headcase(treeops.tag(word))
                if not parts or not prep:  #Weird pp phrase, continue search
                    # NOTE(review): unlike the forward scan, `parts` is not
                    # reset here, so lemmas carry over to the next candidate.
                    head, case, prep = "", "", []
                    continue
                pattcase = helpers.getpatterncase(patt)
                if pattcase not in case and case not in pattcase:  #Wrong pp phrase
                    head, case, parts, prep = "", "", [], []
                    continue
                pp = " ".join(prep)
                pattprep = helpers.getpatternprep(patt)
                if pp not in pattprep or pattprep not in pp:  #wrong preposition phrase
                    pp, head, case, parts, prep = "", "", "", [], []
                    continue
                whole = " ".join(parts)
                break
            # Any other label: no branch matches, so the scan simply moves on.
    if not head:
        return "", ""
    return whole, head
def getpreptrigger(node, root):
    """Find the trigger word and trigger type for a prepositional phrase.

    The preposition is built from the words of the parent phrase whose tag
    starts with "a".  If there is none, the preceding phrase decides what
    happens: its preposition is borrowed, or the lookup is delegated to
    ``getiobjtrigger`` / ``getobjtrigger`` / ``getsubjtrigger``, or the
    search is abandoned.  Otherwise the siblings before the parent phrase,
    and then the ones after it, are scanned for a noun phrase
    (``"noun pp|<prep>"``) or a verb phrase (``"passive pp|<prep>"`` /
    ``"active pp|<prep>"``).

    Returns ``(trigger, triggertype)`` or ``("", "")``.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; the ``else`` attachments marked below
    should be confirmed against the original file.
    """
    parts = []  #To account for multi word prepositions
    trigger, triggertype = "", ""
    parent = treeops.getparentphrase(node, root)
    for word in parent.findall(".//WORD"):
        newword = treeops.text(word)
        mytag = treeops.tag(word)
        if "a" in mytag[0]:  #Found preposition
            parts.append(newword)
    if not parts:  #No preposition found, have unmarked np to find correct trigger for.
        if parent.getprevious() is not None:
            previousnode = parent.getprevious()
            ptext = treeops.text(previousnode)
            if "pp" in ptext or "advp" in ptext:
                found = False  #Checking if I find a noun in the phrase
                for aword in previousnode.findall(".//WORD"):
                    newword = treeops.text(aword)
                    mytag = treeops.tag(aword)
                    if "a" in mytag[0]:  #Found preposition
                        parts.append(newword)
                    elif "n" in mytag[0]:  #Only applies to PP phrases
                        found = True
                if found:
                    return "", ""
                # No noun found: continue below with the borrowed preposition.
            elif "iobj" in ptext:
                trigger, triggertype = getiobjtrigger(previousnode, root)
                return trigger, triggertype
            elif "obj" in ptext:
                trigger, triggertype = getobjtrigger(previousnode, root)
                return trigger, triggertype
            elif "subj" in ptext:
                trigger, triggertype = getsubjtrigger(previousnode, root)
                return trigger, triggertype
            elif "ap" in ptext:
                if previousnode.getprevious() is not None:
                    moreprevious = previousnode.getprevious()
                    mtext = treeops.text(moreprevious)
                    found = False
                    if not "np" in mtext:
                        return "", ""
                    else:
                        for every in moreprevious.findall(".//WORD"):
                            etag = treeops.tag(every)
                            if "a" in etag[0]:
                                # NOTE(review): the result of this recursive
                                # call is discarded — confirm intent.
                                getpreptrigger(moreprevious, root)
                            elif "n" in etag[0]:
                                found = True
                            else:
                                continue
                        if not found:  #No noun in NP, can add ap and unmarked np to it
                            if "subj" in mtext:
                                trigger, triggertype = getsubjtrigger(
                                    moreprevious, root)
                                return trigger, triggertype
                            elif "iobj" in mtext:
                                trigger, triggertype = getiobjtrigger(
                                    moreprevious, root)
                                return trigger, triggertype
                            elif "obj" in mtext:
                                trigger, triggertype = getobjtrigger(
                                    moreprevious, root)
                                return trigger, triggertype
                            else:
                                return "", ""
                        else:
                            return "", ""
                else:
                    return "", ""
            # NOTE(review): assumed to close the `ptext` chain rather than the
            # `parent.getprevious()` test above — confirm.
            else:
                return "", ""
    if len(parts) == 1:
        prep = parts[0]
    else:
        prep = " ".join(parts)
    # Getting the trigger
    current = parent
    while current.getprevious() is not None:
        current = current.getprevious()
        text = treeops.text(current)
        # `text in "np"` is true when the label is a substring of "np".
        if "np-qual" in text or "timex" in text or "advp" in text or text in "np" or text in "nps" or text in "ap" or text in "aps":
            continue
        elif "np" in text:
            triggertype = "noun pp|" + prep
            trigger = treeops.gettriggernoun(current)
            break
        elif "vp" in text:
            if "vpp-comp" in text:
                triggertype = "passive pp|" + prep
                trigger = treeops.gettriggerverb(current)
                break
            elif "vpi" in text or "vps" in text or "vpg" in text:
                triggertype = "active pp|" + prep
                trigger = treeops.gettriggerverb(current)
                break
            else:  # This verb might be the only one, maybe there's something else to check
                triggertype = "active pp|" + prep
                trigger = treeops.gettriggerverb(current)
                # Keep walking backwards for a more specific verb phrase.
                loopcurrent = current
                while loopcurrent.getprevious() is not None:
                    loopcurrent = loopcurrent.getprevious()
                    ltext = treeops.text(loopcurrent)
                    if "advp" in ltext or "np-qual" in ltext or "timex" in ltext or ltext in "np" or ltext in "nps" or ltext in "ap" or ltext in "aps":
                        continue
                    elif "vpp-comp" in ltext:
                        triggertype = "passive pp|" + prep
                        trigger = treeops.gettriggerverb(loopcurrent)
                        break
                    elif "vpi" in ltext or "vps" in ltext or "vpg" in ltext:
                        triggertype = "active pp|" + prep
                        trigger = treeops.gettriggerverb(loopcurrent)
                        break
                    elif "pp" in ltext:
                        continue
                    else:
                        break  #this verb is the only one we have.
                break
            # Unreachable: every branch above already leaves the loop.
            break
        elif "pp" in text:
            continue
        else:
            break  #nothing found here
    if not trigger:  # Looking after the PP
        current = parent
        while current.getnext() is not None:
            current = current.getnext()
            text = treeops.text(current)
            if "np-qual" in text or "nps-qual" in text or "timex" in text or "advp" in text or text in "np" or text in "nps" or text in "ap" or text in "aps":  #Change: advp and unmarked NPs added
                continue
            elif "np" in text:
                triggertype = "noun pp|" + prep
                trigger = treeops.gettriggernoun(current)
                break
            elif "vp" in text:
                if "vpp-comp" in text:
                    triggertype = "passive pp|" + prep
                    trigger = treeops.gettriggerverb(current)
                    break
                elif "vpi" in text or "vps" in text or "vpg" in text:
                    triggertype = "active pp|" + prep
                    trigger = treeops.gettriggerverb(current)
                    break
                else:
                    triggertype = "active pp|" + prep
                    trigger = treeops.gettriggerverb(current)
                    # Keep walking forwards for a more specific verb phrase.
                    loopcurrent = current
                    while loopcurrent.getnext() is not None:
                        loopcurrent = loopcurrent.getnext()
                        ltext = treeops.text(loopcurrent)
                        if "advp" in ltext or ltext in "np" or ltext in "nps" or ltext in "ap" or ltext in "aps" or "np-qual" in ltext or "timex" in ltext:
                            continue
                        elif "vpp-comp" in ltext:
                            triggertype = "passive pp|" + prep
                            trigger = treeops.gettriggerverb(loopcurrent)
                            break
                        elif "vpi" in ltext or "vps" in ltext or "vpg" in ltext:
                            triggertype = "active pp|" + prep
                            trigger = treeops.gettriggerverb(loopcurrent)
                            break
                        elif "pp" in ltext:
                            continue
                        else:
                            break
                    break
            elif "scp" in text:
                # NOTE(review): `getnext()` may return None here, which is then
                # passed to `treeops.text` — confirm this cannot happen.
                newnext = current.getnext()
                newtext = treeops.text(newnext)
                if "subj" in newtext:
                    # Step over the subject to reach the phrase after it.
                    newnext = newnext.getnext()
                    newtext = treeops.text(newnext)
                # NOTE(review): assumed to follow the `subj` test rather than
                # nest inside it — confirm.
                if "vp" in newtext:
                    # NOTE(review): the trigger verb is read from `current`
                    # (the "scp" node), not from `newnext` whose label was
                    # just tested — confirm.
                    if "vpp-comp" in newtext:
                        triggertype = "passive pp|" + prep
                        trigger = treeops.gettriggerverb(current)
                        break
                    elif "vpi" in newtext or "vps" in newtext or "vpg" in newtext:
                        triggertype = "active pp|" + prep
                        trigger = treeops.gettriggerverb(current)
                        break
                    else:
                        triggertype = "active pp|" + prep
                        trigger = treeops.gettriggerverb(current)
                        loopcurrent = current
                        while loopcurrent.getnext() is not None:
                            loopcurrent = loopcurrent.getnext()
                            ltext = treeops.text(loopcurrent)
                            if "advp" in ltext or ltext in "np" or ltext in "nps" or ltext in "ap" or ltext in "aps" or "np-qual" in ltext or "timex" in ltext:
                                continue
                            elif "vpp-comp" in ltext:
                                triggertype = "passive pp|" + prep
                                trigger = treeops.gettriggerverb(loopcurrent)
                                break
                            elif "vpi" in ltext or "vps" in ltext or "vpg" in ltext:
                                triggertype = "active pp|" + prep
                                trigger = treeops.gettriggerverb(loopcurrent)
                                break
                            elif "pp" in ltext:
                                continue
                            else:
                                break
                        # No `break` here: the outer scan continues afterwards.
            elif "pp" in text:
                continue
            else:
                break
    if not trigger:
        return "", ""
    return trigger, triggertype