Ejemplo n.º 1
0
def nfold(trees, n=5, stackwindow=2, qwindow=2, tagsize=2, precision=0.95, threshold=500):
    """Run ``onefold`` over ``n`` folds of ``trees`` and average the accuracies.

    For each fold index the data is split with ``makefold``; if the training
    part comes back empty the two parts are swapped.  The first testing tree
    is printed (gold tree and parsed tree) after each fold as a sanity check.

    Returns a 6-tuple: mean ``parseraccuracy``, mean classifier ``accuracy``,
    and the classifier, parser, training set and testing set of the *last*
    fold.
    """
    print("nfold, tagsize=%s"%(tagsize))
    parser_total = 0
    classifier_total = 0
    for fold in range(n):
        training, testing = makefold(fold, n, trees)
        if training == []:
            # Nothing to train on: use the other half instead.
            training, testing = testing, training
        print("makeclassifier(%s)"%(fold))
        parser = onefold(fold, n, training, testing,
                         stackwindow=stackwindow,
                         qwindow=qwindow,
                         tagsize=tagsize,
                         precision=precision,
                         threshold=threshold)
        print("testing[0].dtree before testparser")
        testing[0].showDTree()
        print("testing[0].parsed")
        rebuilt = malt.buildtree(testing[0].parsed, testing[0].leaves)
        print(tb.showDTree(rebuilt))
        parser_total += parser.parseraccuracy
        c = parser.classifier
        classifier_total += c.accuracy
    return parser_total/n, classifier_total/n, c, parser, training, testing
Ejemplo n.º 2
0
def text2term(text):
    """Parse ``text`` with the module-level ``parser``/``tagger`` and return
    the result of ``tree2term`` applied to the tree built from that parse."""
    parsed_sentence = parser.parse(text, tagger=tagger)
    dependency_tree = malt.buildtree(parsed_sentence.parsed, parsed_sentence.leaves)
    return tree2term(dependency_tree)
Ejemplo n.º 3
0
 def parse(self, sentence, tagger=False, keepscore=True, justParse=True):
     """Run the module-level ``parse`` with this object's classifier and
     window/tag-size settings.

     When ``justParse`` is true the returned object also gets a ``dtree``
     attribute built from its ``parsed`` relations and ``leaves``.
     """
     settings = dict(tagger=tagger,
                     tagsize=self.tagsize,
                     stackwindow=self.stackwindow,
                     qwindow=self.qwindow,
                     keepscore=keepscore,
                     preclassified=False,
                     justParse=justParse)
     result = parse(sentence, self.classifier, **settings)
     if not justParse:
         return result
     result.dtree = malt.buildtree(result.parsed, result.leaves)
     return result
Ejemplo n.º 4
0
def parse(tree, classifier, tagger=False, tagsize=5, singlestep=False, silent=True, stackwindow=2, qwindow=3, keepscore=True, preclassified=False, justParse=False):
    """Parse one sentence by repeatedly applying classifier-chosen
    ``malt.STATE`` actions, driven by an agenda of states.

    Parameters
    ----------
    tree : either a sentence object (with ``dtree``/``goldstandard``/``leaves``)
        or raw input that ``tagger.tag`` accepts.
    classifier : supplies ``features``, ``qwindow``, ``stackwindow`` and
        ``classify``.
    tagger : only used when ``tree`` is not recognised as a sentence object.
    tagsize : every word's tag is truncated to this many characters.
    singlestep : print each candidate state and call ``ss`` before acting.
    silent : copied into ``malt.SILENT``.
    stackwindow, qwindow : accepted for interface compatibility; the state
        descriptor is built from ``classifier.stackwindow`` and
        ``classifier.qwindow`` instead.
    keepscore, preclassified, justParse : select the return value (below).

    Returns
    -------
    * ``justParse`` true: ``tree`` with ``tree.parsed`` set to the final
      state's relations.
    * ``keepscore`` false: ``malt.buildtree(relations, words)``.
    * otherwise: ``(score, len(goldstandard), len(stack))`` where ``score``
      comes from ``scoreState`` (after ``usePreclassifiedHDs`` when
      ``preclassified`` is given).
    """
    malt.SILENT = silent
    # NOTE(review): with the builtin ``type`` a comparison against the string
    # "SENTENCE" is never true, so this presumably relies on a project-level
    # ``type`` helper that returns the class name -- confirm before changing.
    if type(tree) == "SENTENCE":
        if tree.dtree:
            s, goldstandard = malt.dtree2state(tree)
        else:
            goldstandard = tree.goldstandard
            s = malt.STATE(text=tree.leaves)
    else:
        words = [malt.WORD(w[0], w[1]) for w in tagger.tag(tree)]
        tree = tb.SENTENCE(False, False, False, False)
        tree.leaves = words
        s = malt.STATE(text=words)
        goldstandard = False
    for w in s.words:
        w.tag = w.tag[:tagsize]
    features = sorted(classifier.features.keys())
    agenda = [s]
    s.score = 0.0
    WARNINGS = True
    while agenda:
        s = agenda.pop()
        if s.queue == []:
            # What we'd like to do (pace Sardar) is to insist that the stack
            # should have only one item on it, and if not then we go on to
            # the next task on the agenda. That works nicely if we use the
            # head percolation table that has CC as the head whenever
            # possible; but overall the one that has CC as worst choice for
            # the head does better (as with Maytham), and in that case
            # trying to get to a nice terminal state by choosing other
            # options from the agenda gets stuck. So we have to do something
            # more simple-minded: attaching the things that haven't been
            # attached to anything to their neighbours seems to work quite
            # nicely. It's not exactly systematic, but it gets a surprising
            # number of things right.
            for dtr, hd in zip(s.stack, s.stack[1:]):
                s.relations[dtr.position] = malt.RELATION(hd.position, dtr.position, 'mod')
        d = s.stateDescriptor(False, qwindow=classifier.qwindow, stackwindow=classifier.stackwindow)
        t = id3.INSTANCE(features, d)
        # The classifier can return several options: it would be nice to
        # weigh them up, using influences from the grammar, and then order
        # the agenda to pay due attention to the confidence of the
        # classifier and the constraints from the grammar, but I can't find
        # any useful influences. So I'm just choosing the best and working
        # with that. I've left it as a loop, even though it's currently a
        # pointless loop because the list only ever has one item on it, so
        # that I can revisit it some time if I have the energy.
        actions = sortTable(classifier.classify(t, printing=False))
        if actions == []:
            print("no action found")
        for action in actions:
            s1 = s.copy()
            s1.score = s.score+10
            if singlestep:
                print('***********************************')
                s1.showState()
                ss("action %s\ns.relations %s\nd %s\n"%(action, s.relations, d))
            # Look the action up by name on malt.STATE; getattr does the
            # same attribute lookup the old eval() did without executing
            # arbitrary text coming out of the classifier.
            transition = getattr(malt.STATE, action[0])
            try:
                if transition(s1, warnings=WARNINGS, throwException=True):
                    agenda.append(s1)
                    break
            except Exception:
                # Deliberate best effort: a failed action is only recovered
                # from when nothing else is left on the agenda, by forcing a
                # shift if there is still input in the queue.
                if agenda == []:
                    if not s1.queue == []:
                        malt.STATE.shift(s1, warnings=WARNINGS)
                        agenda.append(s1)
                    else:
                        break
        agenda.sort(cmp=compstates)
    tree.parsed = s.relations
    if justParse:
        return tree
    # NOTE(review): ``tree`` has been a sentence object since the top of the
    # function, so the "str" test looks like it can never succeed -- confirm.
    if type(tree) == "str" or not keepscore:
        return malt.buildtree(s.relations, s.words)
    right1 = scoreState(goldstandard, s, tree)
    if preclassified:
        usePreclassifiedHDs(preclassified, s.relations)
        right2 = scoreState(goldstandard, s, tree)
        return right2, len(goldstandard), len(s.stack)
    return right1, len(goldstandard), len(s.stack)
Ejemplo n.º 5
0
def showtrees(trees, outfile=sys.stdout):
    """Write a LaTeX document comparing each tree's gold standard with its parse.

    For every tree two pages are emitted through ``safeout(outfile)``: the
    gold-standard tree with its arcs, then the parsed tree with its links
    and arcs.  Leaves whose head differs between ``goldstandard`` and
    ``parsed`` (or that appear in only one of them) are coloured red before
    the parsed page is drawn.  Unless ``outfile`` is ``sys.stdout``,
    ``latex`` and ``dvipdf`` are then run on the file.
    """
    preamble = r"""
\documentclass[10pt]{article}
\usepackage[a4paper,landscape]{geometry}
\usepackage{headerfooter}
\usepackage{defns}
\usepackage{lscape}
\usepackage{ifthen}
\usepackage{natbib}
\usepackage{lscape}
\usepackage{examples}
\usepackage{multicol}
\usepackage[usenames,dvipsnames,svgnames,table]{xcolor}
\usepackage{pstricks, pst-node, pst-tree}
\usepackage{graphicx}
\oddsidemargin=0in
\evensidemargin=0in
\begin{document}
\begin{examples}
"""
    gold_page = r"""

\newpage
\item %s

\noindent
DTREE (= Gold Standard)

\noindent
%s
"""
    parsed_page = r"""         
\newpage
\noindent
PARSED

\noindent
%s
"""
    postamble = r"""
\end{examples}
\end{document}
"""
    with safeout(outfile) as out:
        out(preamble)

        for tree in trees:
            # Start every leaf off as "correct"; mismatches are marked below.
            for leaf in tree.leaves:
                leaf.colour = "black"
            gold_tree = simplifytree(tree.dtree)
            gold_depth = depth(gold_tree)
            sentence_text = " ".join([leaf.form for leaf in tree.leaves]).replace("$", r"\$")
            gold_picture = pstree(gold_tree, lsep=min(70, int(350.0/gold_depth)), tsep=20)
            out(gold_page % (sentence_text, gold_picture))

            goldstandard = tree.goldstandard
            parsed = tree.parsed
            out(showTreeAsArcs(tree.leaves, goldstandard))
            for leaf in tree.leaves:
                i = leaf.position
                in_gold = i in goldstandard
                in_parsed = i in parsed
                if in_gold and in_parsed:
                    wrong = not goldstandard[i].hd == parsed[i].hd
                else:
                    # Present on one side only also counts as an error.
                    wrong = in_gold or in_parsed
                if wrong:
                    leaf.colour = "red"

            parsed_tree = simplifytree(buildtree(tree.parsed, tree.leaves))
            parsed_depth = depth(parsed_tree)
            parsed_picture = pstree(parsed_tree, lsep=min(70, int(350/parsed_depth)), tsep=20)
            out(parsed_page % (parsed_picture))
            out(plantlinks(tree))
            out(showTreeAsArcs(tree.leaves, parsed))
        out(postamble)
    if outfile == sys.stdout:
        return
    # outfile[:-4] drops the last four characters (presumably ".tex").
    subprocess.Popen(["latex", outfile]).wait()
    subprocess.Popen(["dvipdf", outfile[:-4]]).wait()
    print("dvipdf complete")
Ejemplo n.º 6
0
def _conll_sentence(leaves, conllfile, n):
    """Turn one sentence's WORD list into a SENTENCE with tree and relations."""
    relations = [RELATION(word.hd, word.position, rel=word.label) for word in leaves]
    preswap(leaves, relations)
    tree = buildtree(relations, leaves)
    sentence = SENTENCE(leaves, conllfile, n, n)
    sentence.dtree = tree
    sentence.leaves = leaves
    # Index the relations by daughter position; the same mapping serves as
    # both the gold standard and the initial "parsed" value.
    relations = {r.dtr: r for r in relations}
    sentence.goldstandard = relations
    sentence.parsed = relations
    return sentence


def readconll(conllfile="%s/ud-treebanks-v1.1/UD_English/wholething.txt"%(programs)):
    """Read a tab-separated CoNLL-style file into SENTENCE and WORD objects.

    Each non-blank line is split on tabs; columns 0, 1, 4, 6 and 7 are used
    as position, form, tag, head and label.  Position and head are shifted
    by -1.  Forms are normalised as follows:

    * a form starting with one of ``?.!*+-=`` is reduced to that character;
    * a form starting with digits followed by a decimal part becomes "9999";
    * tag "NNP" is rewritten to "NP"; every other word's form is lowercased;
    * a form found in the ``specials`` table gets that table's tag.

    A blank line ends a sentence.  Sentences of one word or fewer are not
    added to the result (their words still appear in the word list).  A
    final sentence that is not followed by a blank line is handled exactly
    like the others.

    Returns ``(sentences, words)``: the list of SENTENCE objects and the
    list of every WORD read.
    """
    specials = {
        "-": "DASH",
        "that": "THAT",
        "if": "IF",
        "his": "PX",
        "my": "PX",
        "our": "PX",
        "your": "PX",
        "their": "PX",
        "sure": "JJ",
        "ago": "NI",
        "of": "OF",
        "has": "VH", "had": "VH", "have": "VH", "having": "VH",
        "be": "VX", "am": "VX", "is": "VX", "are": "VX", "was": "VX",
        "were": "VX", "being": "VX", "been": "VX",
    }
    preps = ['@', 'about', 'across', 'after', 'although', 'as', 'at', 'because', 'before', 'behind', 'besides', 'between', 'beyond', 'by', 'due', 'during', 'for', 'in', 'into', 'near', 'on', 'over', 'out', 'per', 'since', 'than', 'through', 'till', 'under', 'up', 'vs', 'vs.', 'whereas', 'while', 'with']
    for p in preps:
        specials[p] = "IN"

    # Column indices within a tab-split line.
    POSITION = 0
    FORM = 1
    TAG = 4
    HD = 6
    LABEL = 7
    # Compiled once instead of once per input line.
    decimal = re.compile(r"\d+(\.\d+)")

    sentences = []
    words = []
    sentence = []
    n = 0
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open(conllfile) as conll:
        for line in conll:
            line = line.strip()
            line = line.replace("``", '"').replace("''", '"')
            if line == "":
                if len(sentence) > 1:
                    sentences.append(_conll_sentence(sentence, conllfile, n))
                sentence = []
                continue
            word = line.split("\t")
            form = word[FORM]
            if form[0] in "?.!*+-=":
                word[FORM] = form[0]
                form = word[FORM]
            if decimal.match(form):
                word[FORM] = "9999"
                form = word[FORM]
            if word[TAG] == "NNP":
                word[TAG] = "NP"
            else:
                word[FORM] = form.lower()
                form = word[FORM]
            if form in specials:
                word[TAG] = specials[form]
            word = WORD(form=word[FORM], tag=word[TAG], label=word[LABEL], hd=int(word[HD])-1, position=int(word[POSITION])-1)
            sentence.append(word)
            words.append(word)
    # File did not end with a blank line: flush the sentence still pending.
    if len(sentence) > 1:
        sentences.append(_conll_sentence(sentence, conllfile, n))
    return sentences, words