Beispiel #1
0
def findiobjextract(patt, node, root):
    """Find the indirect-object phrase next to the phrase containing ``node``.

    The walk starts at the phrase returned by ``treeops.getparentphrase``.
    Its following siblings are examined first; if that produces no head,
    its preceding siblings are examined.  Siblings whose label (as given by
    ``treeops.text``) is considered transparent are stepped over.  The first
    other sibling ends the walk, and it is handed to
    ``treeops.getextractphrase`` only when its label contains ``"iobj"``.

    Returns ``(whole, head)`` when a head was found and its case is
    compatible with the case of ``patt`` (one string contains the other);
    otherwise ``("", "")``.
    """
    contained_markers = ("advp", "qual", "timex", "pp", "vp")
    # These are tested with ``label in name`` (label is a substring of the
    # name), exactly as the checks have always been written.
    enclosing_names = ("obj", "np", "nps", "ap", "aps")

    def can_step_over(label):
        for marker in contained_markers:
            if marker in label:
                return True
        for name in enclosing_names:
            if label in name:
                return True
        return False

    anchor = treeops.getparentphrase(node, root)
    whole, head = "", ""
    # Forward first, then backward; the second pass only runs while no head
    # has been found.
    for direction in ("getnext", "getprevious"):
        sibling = getattr(anchor, direction)()
        while sibling is not None:
            label = treeops.text(sibling)
            if not can_step_over(label):
                if "iobj" in label:
                    whole, head, excase = treeops.getextractphrase(
                        sibling, root)
                # Anything that cannot be stepped over ends this direction.
                break
            sibling = getattr(sibling, direction)()
        if head:
            break

    if not head:
        return "", ""
    pattcase = helpers.getpatterncase(patt)
    if pattcase in excase or excase in pattcase:
        return whole, head
    return "", ""
Beispiel #2
0
def findsubjextract(patt, node, root):
    """Find the subject phrase belonging to the phrase containing ``node``.

    Three walks over the siblings of ``treeops.getparentphrase(node, root)``
    are tried in order, each only if the previous one produced no head:

    1. backwards, looking for a label containing ``"subj"``;
    2. forwards, looking for a label containing ``"subj"``;
    3. backwards, looking for an AP complement acting as subject.

    In every walk adverbial, prepositional, verbal, qualifier and time
    phrases are stepped over, as are the bare labels listed for that walk.
    In the backward walks a relative-clause marker (``scp``) is stepped over
    as well, so the phrase in front of it is examined next, e.g.
    ``=Maðurinn= sem *beit* hestinn``.  Any other label ends the walk.

    Returns ``(whole, head)`` from ``treeops.getextractphrase`` when a head
    was found and its case is compatible with the case of ``patt`` (one
    string contains the other); otherwise ``("", "")``.
    """
    parent = treeops.getparentphrase(node, root)

    def is_subject(text):
        return "subj" in text

    def is_ap_comp(text):
        # NOTE(review): this is a "substring of" test, so a bare "ap"/"aps"
        # label matches too -- confirm whether an exact match was intended.
        return text in "ap-comp" or text in "aps-comp"

    def scan(step, bare_labels, is_target, skip_scp):
        """Walk siblings via ``step``; return the extraction or empties."""
        current = getattr(parent, step)()
        while current is not None:
            text = treeops.text(current)
            skippable = (
                "advp" in text or "pp" in text or "vp" in text
                or "qual" in text or "timex" in text
                or any(text in bare for bare in bare_labels))
            if not skippable:
                if is_target(text):
                    return treeops.getextractphrase(current, root)
                if not (skip_scp and "scp" in text):
                    break
                # A relative-clause marker: keep walking so the node in
                # front of it gets the full set of checks on the next turn.
            current = getattr(current, step)()
        return "", "", ""

    whole, head, excase = scan(
        "getprevious", ("np", "nps", "ap", "aps"), is_subject, True)
    if not head:
        whole, head, excase = scan(
            "getnext", ("np", "nps", "ap", "aps"), is_subject, False)
    if not head:  # Looking for AP-COMP acting as subj
        whole, head, excase = scan(
            "getprevious", ("np", "nps"), is_ap_comp, True)
        if head:
            # Single-argument call form: prints the same text on Python 2
            # and Python 3.
            print("yay, ap-comp acting as np-subj! {}".format(whole))
    if not head:
        return "", ""
    pattcase = helpers.getpatterncase(patt)
    if pattcase not in excase and excase not in pattcase:
        return "", ""
    return whole, head
Beispiel #3
0
def getyestertrigger(node, root):
    """Find the trigger word for a phrase whose preposition is fixed to "í".

    The siblings before the phrase returned by ``treeops.getparentphrase``
    are scanned first; if no trigger comes out of that, the siblings after
    it are scanned by the same rules:

    * qualifier/time phrases and bare NP/AP labels are stepped over, as are
      labels containing ``"pp"``;
    * a label containing ``"np"`` yields a noun trigger
      (``treeops.gettriggernoun``) of type ``"noun pp|í"``;
    * a label containing ``"vp"`` yields a verb trigger
      (``treeops.gettriggerverb``), typed passive for ``vpp-comp`` and
      active otherwise.  For a verb phrase that is neither ``vpp-comp`` nor
      ``vpi``/``vps``/``vpg`` the scan keeps going in the same direction and
      prefers such a marked verb phrase if it meets one;
    * anything else ends the scan.

    Returns ``(trigger, triggertype)``, or ``("", "")`` when no non-empty
    trigger was found.
    """
    prep = "í"
    start = treeops.getparentphrase(node, root)

    def is_bare(label):
        # "Substring of" tests, matching the long-standing form of the checks.
        return any(label in name for name in ("np", "nps", "ap", "aps"))

    def is_marked_verb(label):
        return any(form in label for form in ("vpi", "vps", "vpg"))

    def verb_trigger(verbnode, label, step):
        """Resolve the trigger for a verb phrase met while scanning."""
        if "vpp-comp" in label:
            return treeops.gettriggerverb(verbnode), "passive pp|" + prep
        fallback = treeops.gettriggerverb(verbnode)
        active = "active pp|" + prep
        if is_marked_verb(label):
            return fallback, active
        # This verb might be the only one; look further for a better one.
        other = getattr(verbnode, step)()
        while other is not None:
            olabel = treeops.text(other)
            if ("advp" in olabel or is_bare(olabel)
                    or "np-qual" in olabel or "timex" in olabel):
                pass
            elif "vpp-comp" in olabel:
                return treeops.gettriggerverb(other), "passive pp|" + prep
            elif is_marked_verb(olabel):
                return treeops.gettriggerverb(other), active
            elif "pp" not in olabel:
                break  # this verb is the only one we have
            other = getattr(other, step)()
        return fallback, active

    def scan(step):
        sibling = getattr(start, step)()
        while sibling is not None:
            label = treeops.text(sibling)
            if "np-qual" in label or is_bare(label) or "timex" in label:
                pass
            elif "np" in label:
                return treeops.gettriggernoun(sibling), "noun pp|" + prep
            elif "vp" in label:
                return verb_trigger(sibling, label, step)
            elif "pp" not in label:
                break  # nothing found here
            sibling = getattr(sibling, step)()
        return "", ""

    trigger, triggertype = scan("getprevious")
    if not trigger:
        trigger, triggertype = scan("getnext")
    if not trigger:
        return "", ""
    return trigger, triggertype
Beispiel #4
0
def findppextract(patt, node, root):
    """Find the prepositional phrase that fills the slot described by ``patt``.

    Candidates are looked for among the siblings of
    ``treeops.getparentphrase(node, root)``: first the following siblings,
    then (if that gave no head) the preceding ones.  A candidate is one of

    * a node for which ``treeops.yester`` is true ("í gær" tagged as ADVP);
    * an ADVP whose words tagged ``a...`` act as the preposition, followed
      by bare NP / time phrases supplying the content words;
    * a PP, split into preposition words and content lemmas.

    A candidate is accepted when it has both preposition and content words,
    its head case is compatible with ``helpers.getpatterncase(patt)`` (one
    string contains the other) and its preposition matches
    ``helpers.getpatternprep(patt)`` in both directions.

    Every candidate is parsed into fresh lists, so words of a rejected or
    headless candidate never leak into the next one.

    Returns ``(whole, head)`` -- the space-joined lemmas and the head noun
    lemma -- or ``("", "")`` when nothing suitable was found.
    """

    def fits_pattern(pp, case):
        """Check case first, then preposition, against the pattern."""
        pattcase = helpers.getpatterncase(patt)
        if pattcase not in case and case not in pattcase:  # Wrong pp phrase
            return False
        pattprep = helpers.getpatternprep(patt)
        return pp in pattprep and pattprep in pp

    def accepted(prep, parts, case):
        return bool(parts) and bool(prep) and fits_pattern(
            " ".join(prep), case)

    def read_pp(phrase):
        """Split a PP into preposition words, lemmas and first-noun head."""
        prep, parts, head, case = [], [], "", ""
        for word in phrase.findall(".//WORD"):
            if "a" in treeops.tag(word)[0]:
                prep.append(treeops.text(word))
            else:
                parts.append(treeops.lemma(word))
                if not head and treeops.isnoun(word):
                    head = treeops.lemma(word)
                    case = treeops.headcase(treeops.tag(word))
        return prep, parts, head, case

    def read_advp(phrase):
        """Treat an ADVP plus the bare NP/time phrases after it as a PP."""
        prep = [treeops.text(word)
                for word in phrase.findall(".//WORD")
                if "a" in treeops.tag(word)[0]]
        parts, head, case = [], "", ""
        follower = phrase.getnext()
        while follower is not None:
            label = treeops.text(follower).replace("[", "")
            if not (label in "np" or label in "nps" or "timex" in label):
                break
            for word in follower.findall(".//WORD"):
                parts.append(treeops.lemma(word))
                if not head and treeops.isnoun(word):
                    head = treeops.lemma(word)
                    case = treeops.headcase(treeops.tag(word))
            follower = follower.getnext()
        return prep, parts, head, case

    def absorb_np(phrase, parts):
        """Append an NP's lemmas to ``parts``; return its last noun lemma."""
        head = ""
        for word in phrase.findall(".//WORD"):
            parts.append(treeops.lemma(word))
            if treeops.isnoun(word):
                head = treeops.lemma(word)
        return head

    parent = treeops.getparentphrase(node, root)

    # ---- Forward search -------------------------------------------------
    current = parent
    while current.getnext() is not None:
        current = current.getnext()
        # NOTE(review): the backward search strips "[" from the label before
        # the bare-label tests, this one does not -- confirm which is meant.
        text = treeops.text(current)
        if treeops.yester(current):
            # Deals with "í gær" being categorized as ADVP, not PP
            if fits_pattern("í", "nom"):
                return "gær", "gær"
        elif "advp" in text:
            prep, parts, head, case = read_advp(current)
            if accepted(prep, parts, case):
                if head:
                    return " ".join(parts), head
                break  # matched but headless: try the backward search
        elif ("qual" in text or text in "np" or text in "nps"
              or text in "ap" or text in "aps" or "timex" in text):
            continue
        elif "pp" in text:
            prep, parts, head, case = read_pp(current)
            if accepted(prep, parts, case):
                if head:
                    return " ".join(parts), head
                # No head inside the PP itself: it may sit in the NP that
                # follows directly.
                follower = current.getnext()
                if follower is None:
                    return "", ""
                if "np" in treeops.text(follower):
                    head = absorb_np(follower, parts)
                    if head:
                        return " ".join(parts), head
                break
        else:
            # Nothing usable after the phrase.  If it is directly preceded
            # by a relative-clause marker, the PP in front of that marker is
            # the only remaining forward-style candidate.
            marker = parent.getprevious()
            if marker is None or "scp" not in treeops.text(marker):
                break
            fronted = marker.getprevious()
            if fronted is None or "pp" not in treeops.text(fronted):
                break
            prep, parts, head, case = read_pp(fronted)
            if not accepted(prep, parts, case):
                return "", ""
            if head:
                return " ".join(parts), head
            # Look for the head in the node after the PP, reading that
            # node's own label and words.
            follower = fronted.getnext()
            if follower is not None and "np" in treeops.text(follower):
                head = absorb_np(follower, parts)
                if head:
                    return " ".join(parts), head
            break

    # ---- Backward search ------------------------------------------------
    # Labels matching none of the branches are stepped over here; there is
    # deliberately no terminating "else" in this direction.
    current = parent
    while current.getprevious() is not None:
        current = current.getprevious()
        text = treeops.text(current).replace("[", "")
        if treeops.yester(current):
            if fits_pattern("í", "nom"):
                return "gær", "gær"
        elif "advp" in text:
            prep, parts, head, case = read_advp(current)
            if accepted(prep, parts, case):
                if head:
                    return " ".join(parts), head
                return "", ""
        elif ("qual" in text or text in "np" or text in "nps"
              or text in "ap" or text in "aps" or "timex" in text):
            continue
        elif "pp" in text:
            prep, parts, head, case = read_pp(current)
            if accepted(prep, parts, case):
                if head:
                    return " ".join(parts), head
                return "", ""
    return "", ""
Beispiel #5
0
def getpreptrigger(node, root):
    """Find the word that triggers the prepositional phrase containing ``node``.

    Step 1 -- determine the preposition.  Words of the parent phrase
    (``treeops.getparentphrase``) whose tag starts with ``a`` are collected;
    several such words are joined with spaces.  If there are none, the
    phrase is an unmarked NP and the preceding sibling decides what happens:

    * PP / ADVP: its ``a``-tagged words supply the preposition, unless it
      also contains an ``n``-tagged word, in which case ``("", "")`` is
      returned;
    * object / indirect object / subject: the matching ``get*trigger``
      function is called on that sibling and its result returned;
    * AP: the node before it must be an NP without an ``n``-tagged word;
      a role on that NP's label is dispatched to the matching
      ``get*trigger`` function, and with no role the search goes on with an
      empty preposition;
    * anything else, or no preceding sibling: ``("", "")``.

    Step 2 -- find the trigger.  Siblings before the phrase are scanned,
    then siblings after it if that gave nothing.  An NP yields a noun
    trigger (``"noun pp|<prep>"``), a VP a verb trigger
    (``"passive pp|<prep>"`` for ``vpp-comp``, else ``"active pp|<prep>"``).
    In the forward scan a relative-clause marker (``scp``) followed by an
    optional subject and a verb phrase yields that verb phrase's trigger.

    Returns ``(trigger, triggertype)`` or ``("", "")``.
    """
    parent = treeops.getparentphrase(node, root)
    # To account for multi word prepositions
    parts = [treeops.text(word)
             for word in parent.findall(".//WORD")
             if "a" in treeops.tag(word)[0]]

    if not parts:
        # No preposition found, have unmarked np to find correct trigger for.
        previousnode = parent.getprevious()
        if previousnode is None:
            return "", ""
        ptext = treeops.text(previousnode)
        if "pp" in ptext or "advp" in ptext:
            found = False  # Checking if I find a noun in the phrase
            for aword in previousnode.findall(".//WORD"):
                mytag = treeops.tag(aword)
                if "a" in mytag[0]:  # Found preposition
                    parts.append(treeops.text(aword))
                elif "n" in mytag[0]:  # Only applies to PP phrases
                    found = True
            if found:
                return "", ""
        elif "iobj" in ptext:
            return getiobjtrigger(previousnode, root)
        elif "obj" in ptext:
            return getobjtrigger(previousnode, root)
        elif "subj" in ptext:
            return getsubjtrigger(previousnode, root)
        elif "ap" in ptext:
            moreprevious = previousnode.getprevious()
            if moreprevious is None:
                return "", ""
            mtext = treeops.text(moreprevious)
            if "np" not in mtext:
                return "", ""
            found = False
            for every in moreprevious.findall(".//WORD"):
                etag = treeops.tag(every)
                if "a" in etag[0]:
                    # NOTE(review): the result of this recursive call is
                    # discarded, so it has no effect on the outcome here.
                    # Possibly a ``return`` was intended -- confirm.
                    getpreptrigger(moreprevious, root)
                elif "n" in etag[0]:
                    found = True
            if found:
                return "", ""
            # No noun in NP, can add ap and unmarked np to it
            if "subj" in mtext:
                return getsubjtrigger(moreprevious, root)
            elif "iobj" in mtext:
                return getiobjtrigger(moreprevious, root)
            elif "obj" in mtext:
                return getobjtrigger(moreprevious, root)
            # No role on the NP: continue below with an empty preposition.
        else:
            return "", ""

    prep = " ".join(parts)

    def is_bare(label):
        # "Substring of" tests, matching the long-standing form of the checks.
        return any(label in name for name in ("np", "nps", "ap", "aps"))

    def is_marked_verb(label):
        return any(form in label for form in ("vpi", "vps", "vpg"))

    def verb_trigger(verbnode, label, step):
        """Resolve the trigger for the verb phrase ``verbnode``."""
        if "vpp-comp" in label:
            return treeops.gettriggerverb(verbnode), "passive pp|" + prep
        fallback = treeops.gettriggerverb(verbnode)
        active = "active pp|" + prep
        if is_marked_verb(label):
            return fallback, active
        # This verb might be the only one; look further in the same
        # direction for a marked verb phrase to prefer over it.
        other = getattr(verbnode, step)()
        while other is not None:
            olabel = treeops.text(other)
            if ("advp" in olabel or "np-qual" in olabel
                    or "timex" in olabel or is_bare(olabel)):
                pass
            elif "vpp-comp" in olabel:
                return treeops.gettriggerverb(other), "passive pp|" + prep
            elif is_marked_verb(olabel):
                return treeops.gettriggerverb(other), active
            elif "pp" not in olabel:
                break  # this verb is the only one we have.
            other = getattr(other, step)()
        return fallback, active

    def scan(step, forward):
        current = getattr(parent, step)()
        while current is not None:
            text = treeops.text(current)
            if ("np-qual" in text or (forward and "nps-qual" in text)
                    or "timex" in text or "advp" in text or is_bare(text)):
                pass
            elif "np" in text:
                return treeops.gettriggernoun(current), "noun pp|" + prep
            elif "vp" in text:
                return verb_trigger(current, text, step)
            elif forward and "scp" in text:
                # Relative clause after the PP: step over an optional
                # subject and take the trigger from the verb phrase itself.
                # The clause may end right after the marker or the subject,
                # hence the None checks.
                clause = current.getnext()
                clabel = treeops.text(clause) if clause is not None else ""
                if "subj" in clabel:
                    clause = clause.getnext()
                    clabel = (treeops.text(clause)
                              if clause is not None else "")
                if "vp" in clabel:
                    return verb_trigger(clause, clabel, step)
            elif "pp" not in text:
                break  # nothing found here
            current = getattr(current, step)()
        return "", ""

    # Getting the trigger
    trigger, triggertype = scan("getprevious", False)
    if not trigger:  # Looking after the PP
        trigger, triggertype = scan("getnext", True)
    if not trigger:
        return "", ""
    return trigger, triggertype