Beispiel #1
0
    def evaluate(self, trees):
        """Parse every tree's sentence and accumulate evaluation scores.

        Each sentence is parsed with ``self.fm`` and ``self.network``; the
        prediction is compared with the reference tree directly, via
        ``compare_disc_constituency``, and (after conversion with
        ``const2rst``) via ``evaltrees``.

        Returns a dict with the keys ``const``, ``disc_const``, ``seg``,
        ``span``, ``nucs`` and ``labels``, each mapped to the FScore
        accumulated over all ``trees``.
        """
        from spanparser.parser import Parser as SParser
        from spanparser.phrase_tree import FScore

        metric_names = ("const", "disc_const", "seg", "span", "nucs", "labels")
        totals = {name: FScore() for name in metric_names}
        # key in the evaltrees() result -> key in the returned dictionary
        rst_metrics = (("segs", "seg"), ("spans", "span"),
                       ("nucs", "nucs"), ("labels", "labels"))

        for gold in trees:
            guess = SParser.parse(gold.sentence, self.fm, self.network)

            totals["const"] += guess.compare(gold)
            totals["disc_const"] += compare_disc_constituency(guess, gold)

            # Both trees go through the same string round-trip before the
            # RST-style comparison.
            guess_rst = const2rst(Tree.parse(str(guess)),
                                  keep_punct=FLAGS.eval_rstpunct)
            gold_rst = const2rst(Tree.parse(str(gold)),
                                 keep_punct=FLAGS.eval_rstpunct)
            per_tree = evaltrees(guess_rst, gold_rst)
            for source_key, target_key in rst_metrics:
                totals[target_key] += per_tree[source_key]

        return {name: totals[name] for name in metric_names}
Beispiel #2
0
def main():
    """Evaluate a saved span-parser model on a test treebank and report scores.

    Paths and options come from the module-level ``FLAGS`` object (``train``,
    ``model``, ``test``, ``extlabelfeatures``, ``eval_rstpunct``, ``verbose``).
    Aggregate and per-label scores are written to stderr; when
    ``FLAGS.verbose`` is set, the input, reference and prediction of every
    tree are printed to stdout.
    """
    from spanparser.phrase_tree import PhraseTree, FScore
    from spanparser.parser import Parser as SParser
    from spanparser.features import FeatureMapper
    from spanparser.network import Network
    from rst_evaluation import const2rst, evaltrees, compare_disc_constituency
    from tree import Tree

    SParser.extlabelfeatures = FLAGS.extlabelfeatures

    # The feature mapper is rebuilt from the training trees; only the
    # network itself is loaded from disk.
    training_trees = PhraseTree.load_treefile(FLAGS.train)
    fm = FeatureMapper(training_trees)
    network = Network.load(FLAGS.model)
    print("Loaded model from {}".format(FLAGS.model), file=sys.stderr)

    trees = PhraseTree.load_treefile(FLAGS.test)
    print("Evaluating on {}".format(FLAGS.test), file=sys.stderr)

    const_acc = FScore()
    dis_const_acc = FScore()
    seg_acc, span_acc, nucs_acc, labels_acc = (FScore(), FScore(), FScore(),
                                               FScore())
    label_specific_acc = defaultdict(FScore)
    for i, tree in enumerate(trees):
        predicted = SParser.parse(tree.sentence, fm, network)
        local_const_acc = predicted.compare(tree)
        local_dis_const_acc = compare_disc_constituency(predicted, tree)
        const_acc += local_const_acc
        dis_const_acc += local_dis_const_acc

        predicted_rst = const2rst(Tree.parse(str(predicted)),
                                  keep_punct=FLAGS.eval_rstpunct)
        ref_rst = const2rst(Tree.parse(str(tree)),
                            keep_punct=FLAGS.eval_rstpunct)
        accuracies = evaltrees(predicted_rst, ref_rst, label_specific=True)

        if FLAGS.verbose:
            print("######## Tree {} ########".format(i))
            print("# input")
            print(tree.sentence)
            print("# reference")
            print(ref_rst.pretty_str())
            print("# predicted")
            print(predicted_rst.pretty_str())

        seg_acc += accuracies['segs']
        span_acc += accuracies['spans']
        nucs_acc += accuracies['nucs']
        labels_acc += accuracies['labels']
        # .items() exists on Python 2 and 3; .iteritems() is Python 2 only.
        for label, acc in accuracies['label_specific'].items():
            label_specific_acc[label] += acc

    print('Const {}, DIS:Const {}, Seg {}, Span {}, Nucs {}, Labels {}'.format(
        const_acc, dis_const_acc, seg_acc, span_acc, nucs_acc, labels_acc),
          file=sys.stderr)
    print('----------- label-specific accuracies -----------', file=sys.stderr)
    for label, acc in label_specific_acc.items():
        print(label, acc, file=sys.stderr)
Beispiel #3
0
def main():
    """Read one tree per stdin line, binarize it and print the result."""
    for raw_line in sys.stdin:
        binarized = binarize(treelib.parse(raw_line))
        print(binarized)
Beispiel #4
0
def readkparses(f, k):
	'''Yield k (feature vector, tree) pairs read from the open file f.

	Each parse occupies two consecutive lines: the first is handed to
	FVector.parse, the second to Tree.parse (trunc=True, lower=True).
	'''
	# range() works on Python 2 and 3; xrange() is Python 2 only.
	for _ in range(k):
		fvector = FVector.parse(f.readline().strip())
		parse = f.readline().strip()
		tree = Tree.parse(parse, trunc=True, lower=True)

		yield (fvector, tree)
Beispiel #5
0
def check_in_language(tree_str, conf_file):
    """Return the tree's string form if its constraints are satisfiable.

    The configuration is read with ``read_features`` and the tree is parsed
    from ``tree_str``. The constraints produced by ``get_sat_constraints``
    are conjoined and passed to ``get_model``. A truthy model yields
    ``tree_to_str`` of the parsed tree, anything else yields ``None``.
    """
    features, rules = read_features(conf_file)
    parsed = Tree.parse(tree_str.strip(), trunc=False)
    # Only the constraint list is used; the returned symbols are ignored.
    _symbols, constraints = get_sat_constraints(parsed, features, rules, {},
                                                [])
    if not get_model(And(constraints)):
        return None
    return tree_to_str(parsed)
Beispiel #6
0
def readonebest(f):
	'''Yield one tree per non-blank line (1-best output, or gold).'''
	stream = getfile(f)
	# readline() returns '' only at end of file, which ends the iteration;
	# lines consisting of a lone newline are skipped.
	for raw in iter(stream.readline, ''):
		if raw != '\n':
			yield Tree.parse(raw.strip(), trunc=True, lower=True)
    def load_data_file(self, file_path, word_counter):
        """Load labelled premise/hypothesis pairs from a tab-separated file.

        Each line is ``label<TAB>premise<TAB>hypothesis``. Lines whose label
        is not a key of ``self.relations`` are skipped silently. Every token
        of a kept pair is passed to ``self.inc_word_counts`` together with
        ``word_counter``, and tokens are lower-cased first when
        ``self.lower_case`` is set.

        Returns a list of dicts with the keys ``label`` (the value mapped by
        ``self.relations``), ``id`` (position in the returned list),
        ``p_tree`` and ``h_tree`` (``Tree`` objects fed the space-joined
        premise and hypothesis).
        """
        data = []
        # The context manager closes the handle even if a line fails to load.
        with open(file_path, 'r') as f:
            for line in f:
                line_split = line.strip().split('\t')
                gold_label = line_split[0]
                # `in` replaces dict.has_key(), which Python 3 removed.
                if gold_label not in self.relations:
                    continue

                premise = line_split[1].strip().split()
                hypothese = line_split[2].strip().split()

                if self.lower_case:
                    premise = [p.lower() for p in premise]
                    hypothese = [h.lower() for h in hypothese]

                for p in premise:
                    self.inc_word_counts(p, word_counter)
                for h in hypothese:
                    self.inc_word_counts(h, word_counter)

                p_tree = Tree()
                h_tree = Tree()
                p_tree.parse(' '.join(premise))
                h_tree.parse(' '.join(hypothese))

                data.append({
                    'label': self.relations[gold_label],
                    'id': len(data),
                    'p_tree': p_tree,
                    'h_tree': h_tree
                })

        return data
Beispiel #8
0
def main():
    """Collect counts from the trees on stdin, normalize them and print them.

    Each stdin line is parsed with ``treelib.parse`` and passed to
    ``update_counts``; the result is then handed to ``normalize_pcfg`` and
    ``print_pcfg``.
    """
    pcfg = {}
    label_counts = {}

    for raw_line in sys.stdin:
        update_counts(treelib.parse(raw_line), label_counts, pcfg)

    normalize_pcfg(pcfg, label_counts)
    print_pcfg(pcfg)
def binarizeTree(tree):
    """Walk ``tree`` recursively, printing debug output; return the input tree.

    Nodes whose ``word`` is set return ``None`` immediately. For other
    nodes every child is visited first. When a node has more than two
    children, a bracketed string for a new "primed" node over all children
    but the first is built, parsed and visited as well.

    NOTE(review): the parsed helper tree is never attached to ``tree``, so
    the input is returned structurally unchanged -- confirm whether the
    binarization is meant to be applied.
    """
    if not (tree.word == None):
        return None

    for child in tree.subs:
        binarizeTree(child)
    print('tree here: {} subs: {}'.format(tree, tree.subs))

    if len(tree.subs) > 2:
        newlabel = tree.label + "'"
        remainder = ' '.join(child.dostr() for child in tree.subs[1:])
        temptreestr = '(' + newlabel + ' ' + remainder + ')'
        print('subs: {} new label: {} temptreestr: {}'.format(
            tree.subs, newlabel, temptreestr))
        binarizeTree(Tree.parse(temptreestr))
        print('new original tree: {}'.format(tree))

    return tree
def main():
    """Print every stdin tree after passing it through ``replace_onecount``.

    First pass: parse each stdin line with ``treelib.parse``, keep the tree
    and count how often each word returned by ``get_words`` occurs.
    Second pass: print ``replace_onecount(tree, word_counts)`` for every
    tree, in input order.
    """
    word_counts = {}
    trees = []

    # Indentation uses spaces only: the original mixed tabs and spaces,
    # which Python 3 rejects with a TabError.
    for line in sys.stdin:
        tree = treelib.parse(line)
        trees.append(tree)

        for word in get_words(tree, []):
            word_counts[word] = word_counts.get(word, 0) + 1

    for tree in trees:
        print(replace_onecount(tree, word_counts))
Beispiel #11
0
def load_trees():
    """Load paired RST and constituency trees for every ``wsj*`` document.

    Scans ``FLAGS.rst_path`` for ``*.dis.tok`` files. For each file whose
    base name starts with ``wsj``, the whole file is joined into one line
    and parsed with ``RSTTree.parse``, and every line of the matching
    ``<basename>.cleangold`` file under ``FLAGS.const_path`` is parsed with
    ``Tree.parse``.

    Loading is best effort: a document that fails to load is reported on
    ``logs`` and skipped.

    Returns a list of ``(basename, rst_tree, constituency_trees)`` tuples.
    """
    rstpath = "%s/" % FLAGS.rst_path
    constpath = "%s/" % FLAGS.const_path
    treepairs = []
    for rstf in glob.glob(rstpath + "*.dis.tok"):
        basename = rstf.rsplit("/", 1)[1].split(".")[0]
        if not basename.startswith("wsj"):
            continue
        try:
            # `with` closes both files promptly, also when parsing fails.
            with open(rstf) as rst_file:
                rstline = " ".join(line.strip() for line in rst_file)
            rstt = RSTTree.parse(rstline)
            with open(constpath + basename + ".cleangold") as const_file:
                constts = [Tree.parse(line.strip()) for line in const_file]
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate while keeping the best-effort behaviour.
            logs.write("Failed in loading %s\n" % basename)
            continue
        treepairs.append((basename, rstt, constts))
    return treepairs
Beispiel #12
0
def readkbest(f, read_gold=False):
	'''Yield one NBestForest per k-best list read from f.

	Each list starts with a header line "k<TAB>tag", followed by the k
	parses consumed by readkparses. When read_gold is true, one more line
	is read and parsed as the gold tree; otherwise the gold tree is None.
	Blank lines between lists are skipped. Reading stops at end of file
	or at the first line that is not a valid header.
	'''
	f = getfile(f)
	while True:
		line = f.readline()
		if line == '':
			break
		if line == '\n':
			continue
		try:
			k, tag = line.strip().split("\t")
			k = int(k)
		except ValueError:
			# wrong number of fields or a non-integer k
			break  ## can finish earlier

		kparses = list(readkparses(f, k))

		if read_gold:
			goldtree = Tree.parse(f.readline().strip(), trunc=True, lower=True)
		else:
			goldtree = None
		yield NBestForest(k, tag, kparses, goldtree)
Beispiel #13
0
    def load(filename, lower=False, sentid=0):
        '''Generate Forest objects read from a forest file, one per sentence.

        Now returns a generator! Use load().next() for a singleton.

        Input layout, as consumed below:
          * a header line "<tag><TAB><sentence>" (blank lines before it are
            skipped; the tag is often empty);
          * a line holding the number of nodes;
          * per node, a line "<iden><TAB><labelspan><TAB><size>", optionally
            followed by " ||| <node features>", and then <size> hyperedge
            lines "<tails> ||| <features>". A leading "*" on the tails marks
            the oracle edge, and a trailing "~" on the features sets
            use_same, which is only consulted when cache_same is on;
          * optionally a line starting with "(" that is parsed as the gold
            tree.

        filename is passed to getfile(). If lower is true, the words are
        lower-cased and the original casing is kept separately. sentid is
        incremented per forest but is only referenced by the commented-out
        log line.

        Side effects: progress is printed to logs every 100 sentences, and
        Forest.load_time is set once the input is exhausted.
        '''

        file = getfile(filename)
        line = None
        total_time = 0
        num_sents = 0

        while True:

            start_time = time.time()
            ##'\tThe complicated language in ...\n"
            ## tag is often missing
            try:
                # `line` may already hold the next header, read while looking
                # for the gold tree of the previous forest.
                if line is None or line == "\n":
                    line = "\n"
                    while line == "\n":
                        line = file.readline()  # emulate seek
                tag, sent = line.split("\t")
            except:
                ## no more forests
                # (any line without exactly one tab, including '' at end of
                # file, ends the generator)
                break

            num_sents += 1

            sent = sent.split()
            cased_sent = sent[:]
            if lower:
                sent = [w.lower()
                        for w in sent]  # mark johnson: lowercase all words
            num = int(file.readline())

            forest = Forest(num, sent, cased_sent, tag)
            # lookup tables used for sharing when cache_same is on
            forest.labelspans = {}
            forest.short_edges = {}

            delta = num_spu = 0
            for i in xrange(1, num + 1):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = file.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except:
                    # no " ||| " separator: the node carries no features
                    keys = line
                    fields = ""

                iden, labelspan, size = keys.split(
                    "\t")  ## iden can be non-ints
                size = int(size)

                fvector = FVector(fields)
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                # cache_same is not defined in this function; it is looked up
                # in the enclosing scope.
                if cache_same:
                    if labelspan in forest.labelspans:
                        # reuse the feature vector of the first node seen
                        # with this labelspan
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for j in xrange(size):
                    is_oracle = False

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    tails, fields = file.readline().strip().split(" ||| ")

                    if tails[0] == "*":  #oracle edge
                        is_oracle = True
                        tails = tails[1:]

                    tails = tails.split()  ## could be non-integers
                    tailnodes = []

                    for x in tails:
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                            "(in a hyperedge of node #%s) before being defined" % iden
                        ## topological ordering
                        tail = forest.nodes[x]
                        tailnodes.append(tail)

                    use_same = False
                    if fields[-1] == "~":
                        use_same = True
                        fields = fields[:-1]

                    fvector = FVector(fields)
                    edge = Hyperedge(node, tailnodes, fvector)

                    if cache_same:

                        short_edge = edge.shorter()
                        if short_edge in forest.short_edges:
                            edge.same = forest.short_edges[short_edge]
                            if use_same:
                                # "~" marker: add the features of the cached
                                # edge to this edge's own features
                                edge.fvector += edge.same.fvector
                        else:
                            forest.short_edges[short_edge] = edge

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => (12, 3, 4)
            tmp = sorted([(map(int, x.iden.split("-")), x)
                          for x in forest.nodeorder])
            forest.nodeorder = [x for (_, x) in tmp]

            forest.rehash()
            sentid += 1

            ##			print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
            ##				  % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

            # the node read last is taken as the root of the forest
            forest.root = node
            node.set_root(True)

            line = file.readline()

            if line is not None and line.strip() != "":
                if line[0] == "(":
                    forest.goldtree = Tree.parse(line.strip(),
                                                 trunc=True,
                                                 lower=False)
                    line = file.readline()
                # otherwise `line` is kept and treated as the next header
            else:
                line = None

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                   % (num_sents, total_time/num_sents)

            yield forest

        Forest.load_time = total_time
        # NOTE(review): divides by num_sents, so this raises
        # ZeroDivisionError when no forest was loaded -- confirm intended.
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
           % (num_sents, total_time, total_time/num_sents)
from collections import defaultdict
from tree import Tree
from pprint import pprint
import sys

# Standard streams under short aliases.
fin, fout, ferr = sys.stdin, sys.stdout, sys.stderr

# One parsed tree per input line, read eagerly.
trees = []
for line in fin.readlines():
    trees.append(Tree.parse(line.strip()))


def binarize(tree):
    # no children
    if tree.subs == None:
        return tree

    if len(tree.subs) <= 2:
        newsubs = []
        for subtree in tree.subs:
            newsubs.append(binarize(subtree))

        return Tree(tree.label, tree.span, subs=newsubs)
    else:
        subs = [tree.subs[0]]
        newsubs = tree.subs[1:]
        newspan = [subs[0].span[1], tree.span[0]]

        if '_' in tree.label:
            newlabel = tree.label
        else:
Beispiel #15
0
                lexicon.add(word.strip().split()[0])

    # read in grammers
    with open(sys.argv[1], 'r') as f:
        rules = f.readlines()
    for rule in rules[1:]:
        p = rule.split()
        k1 = p[0]
        if len(p) == 5:
            k2 = (p[2], )
            runa[k1][k2] = -math.log(float(p[-1]))
        else:
            k2 = (p[2], p[3])
            rbin[k1][k2] = -math.log(float(p[-1]))

    for line in sys.stdin:
        sentence_origin = line.strip().split()
        sentence = []
        for x in sentence_origin:
            if len(sys.argv) > 2 and x not in lexicon:
                x = '<unk>'
            sentence.append(x)
        dp, back = cky(sentence)

        print(math.exp(-dp[0][len(sentence)]['TOP']))
        res = backtrack(0, len(sentence), 'TOP', sentence_origin, back)
        if len(res) != 0:
            print debinarize(Tree.parse(res))
        else:
            print 'NONE'
Beispiel #16
0
from __future__ import division
from tree import Tree
from collections import defaultdict
import sys

# Usage: <script> TRAINFILE PRODFILE
# Reads one bracketed tree per line from TRAINFILE and writes every observed
# production together with its relative frequency to PRODFILE.
TRAINFILE = sys.argv[1]
PRODFILE = sys.argv[2]

# (lhs, rhs) -> number of times the production was observed
freqs = defaultdict(int)
# lhs -> number of productions observed with that left-hand side
condCounts = defaultdict(int)

with open(TRAINFILE, "r") as f:
    # Iterate the file lazily instead of loading it all with readlines().
    for line in f:
        line = line.strip()

        t = Tree.parse(line, trunc=True)
        prods = t.getProductions()

        for (x, y) in prods:
            freqs[(x, y)] += 1
            condCounts[x] += 1

with open(PRODFILE, "w") as fw:
    # .items() exists on Python 2 and 3; .iteritems() is Python 2 only.
    for (x, y), freq in freqs.items():
        # relative frequency of the production given its left-hand side
        p = freq / condCounts[x]
        fw.write("%s -> %s # %.4f\n" % (x, y, p))
Beispiel #17
0
    def load(filename, is_tforest=False, lower=False, sentid=0, first=None, lm=None):
        '''Generate Forest objects read from a translation-forest file.

        Now returns a generator! Use load().next() for a singleton.

        Input layout per forest, as consumed below:
          * a header line "<tag><TAB><sentence>"; a line that does not split
            into exactly two tab-separated fields makes the generator yield
            None and move on to the next line;
          * a line with the number of references, then that many reference
            lines;
          * a line "<num nodes><TAB><num edges>";
          * per node, a line "<iden><TAB><labelspan><TAB><size>", optionally
            followed by " ||| <node features>", and then <size> hyperedge
            lines "<tails> ||| <rule> ||| <features>". A leading "*" on the
            tails marks the oracle edge. Tail tokens starting with a double
            quote are words, all others are ids of previously defined nodes;
          * optionally a line starting with "(" that is parsed as the gold
            tree.

        filename is passed to getfile(). is_tforest is forwarded to Forest.
        If lower is true, the words are lower-cased and the original casing
        is kept separately. sentid is incremented per forest but is only
        referenced by the commented-out log line. first limits the number
        of forests loaded and defaults to FLAGS.first. lm, when given, is
        used to score the words of each hyperedge, and the score is stored
        under the "lm1" key of the edge's feature vector.

        Side effects: progress is printed to logs every 100 sentences,
        Forest.load_time is set at the end, and the process exits with
        status 1 if no forest was found.
        '''
        if first is None: # N.B.: must be here, not in the param line (after program initializes)
            first = FLAGS.first

        file = getfile(filename)
        line = None
        total_time = 0
        num_sents = 0

        while True:

            start_time = time.time()
            ##'\tThe complicated language in ...\n"
            ## tag is often missing
            # NOTE(review): this read overwrites whatever `line` held at the
            # end of the previous iteration, so that line is dropped --
            # presumably always a blank separator; confirm against the format.
            line = file.readline()  # emulate seek
            if len(line) == 0:
                # end of file
                break
            try:
                ## strict format, no consecutive breaks
#                 if line is None or line == "\n":
#                     line = "\n"
#                     while line == "\n":
#                         line = file.readline()  # emulate seek

                tag, sent = line.split("\t")   # foreign sentence

            except:
                ## no more forests
                yield None
                continue

            num_sents += 1

            # caching the original, word-based, true-case sentence
            sent = sent.split() ## no splitting with " "
            cased_sent = sent [:]
            if lower:
                sent = [w.lower() for w in sent]   # mark johnson: lowercase all words

            #sent = words_to_chars(sent, encode_back=True)  # split to chars

            ## read in references
            refnum = int(file.readline().strip())
            refs = []
            for i in xrange(refnum):
                refs.append(file.readline().strip())

            ## sizes: number of nodes, number of edges (optional)
            num, nedges = map(int, file.readline().split("\t"))

            forest = Forest(sent, cased_sent, tag, is_tforest)

            forest.tag = tag

            forest.refs = refs
            forest.bleu = Bleu(refs=refs)  ## initial (empty test) bleu; used repeatedly later

            forest.labelspans = {}
            forest.short_edges = {}
            # ruleid -> Rule, filled the first time a rule is spelled out
            forest.rules = {}

            for i in xrange(1, num+1):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = file.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except:
                    # no " ||| " separator: the node carries no features
                    keys = line
                    fields = ""

                iden, labelspan, size = keys.split("\t") ## iden can be non-ints
                size = int(size)

                fvector = Vector(fields) #
##                remove_blacklist(fvector)
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                # cache_same is not defined in this function; it is looked up
                # in the enclosing scope.
                if cache_same:
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for j in xrange(size):
                    is_oracle = False

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    ## N.B.: can't just strip! "\t... ||| ... ||| \n" => 2 fields instead of 3
                    tails, rule, fields = file.readline().strip("\t\n").split(" ||| ")

                    if tails != "" and tails[0] == "*":  #oracle edge
                        is_oracle = True
                        tails = tails[1:]

                    tails = tails.split() ## N.B.: don't split by " "!
                    tailnodes = []
                    lhsstr = [] # 123 "thank" 456

                    # language-model bookkeeping, only filled when lm is given
                    lmstr = []
                    lmscore = 0
                    lmlhsstr = []

                    for x in tails:
                        if x[0]=='"': # word
                            word = desymbol(x[1:-1])
                            lhsstr.append(word)  ## desymbol here and only here; ump will call quoteattr

                            if lm is not None:
                                this = lm.word2index(word)
                                # score the word given the words collected
                                # since the last variable
                                lmscore += lm.ngram.wordprob(this, lmstr)
                                lmlhsstr.append(this)
                                lmstr += [this,]

                        else: # variable

                            assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                                         "(in a hyperedge of node #%s) before being defined" % iden
                            tail = forest.nodes[x]
                            tailnodes.append(tail)
                            lhsstr.append(tail)

                            if lm is not None:
                                # a variable resets the word history
                                lmstr = []  # "..." "..." x0 "..."
                                lmlhsstr.append(tail) # sync with lhsstr

                    fvector = Vector(fields)
                    if lm is not None:
                        fvector["lm1"] = lmscore # hack

                    edge = Hyperedge(node, tailnodes, fvector, lhsstr)
                    edge.lmlhsstr = lmlhsstr

                    ## new
                    # rule field: "<ruleid>" alone, or "<ruleid> <rule text>"
                    x = rule.split()
                    edge.ruleid = int(x[0])
                    if len(x) > 1:
                        edge.rule = Rule.parse(" ".join(x[1:]) + " ### " + fields)
                        forest.rules[edge.ruleid] = edge.rule #" ".join(x[1:]) #, None)
                    else:
                        edge.rule = forest.rules[edge.ruleid] # cached rule

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => (12, 3, 4)
            tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
            forest.nodeorder = [x for (_, x) in tmp]

            forest.rehash()
            sentid += 1

##            print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
##                  % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

            # the node read last is taken as the root of the forest
            forest.root = node
            node.set_root(True)
            line = file.readline()

            if line is not None and line.strip() != "":
                if line[0] == "(":
                    forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                    line = file.readline()
            else:
                line = None

            forest.number_nodes()
            #print forest.root.position_id


            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)

            forest.subtree() #compute the subtree string for each node

            yield forest

            if first is not None and num_sents >= first:
                break

        # better check here instead of zero-division exception
        if num_sents == 0:
            print >> logs, "NO FORESTS FOUND!!! (empty input file?)"
            sys.exit(1)
#            yield None # new: don't halt -- WHY?

        Forest.load_time = total_time
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, total_time/(num_sents+0.001))
Beispiel #18
0
        #update prob for label -> left child
        probdict[tree.label][updatedTag] += 1

        #recurse on left child
        getCounts(tree.subs[0], countdict, probdict)

        #if right child exists, recurse on right child
        if len(tree.subs) > 1:
            getCounts(tree.subs[1], countdict, probdict)

    return (probdict, countdict)


for line in sys.stdin:
    tree = Tree.parse(line)
    getCounts(tree, countdict, probdict)

#divide probdict counts by overall num tags to get probs
for i in probdict:
    for j in probdict[i]:
        probdict[i][j] /= float(countdict[i])

#print results
binary = 0
unary = 0
lexical = 0
print("TOP")
for i in probdict:
    for j in probdict[i]:
        if j in probdict.keys():
Beispiel #19
0
    else:
        tmp = ' '.join([s.label for s in t.subs])
        h[t.label][tmp] += 1
        for sub in t.subs:
            count(sub, h)
            #tmp.append(tree_to_str(sub))
        #return "(%s %s)" % (t.label, ' '.join(tmp))


############################
lines = list(sys.stdin)

h = defaultdict(lambda: defaultdict(lambda: 0))

for i, line in enumerate(lines):
    t = Tree.parse(line.strip(), trunc=False)

    count(t, h)

    #t.pp()
    #print(print_tree(t))
    #print_subs(t)

    #binarize(t)
    #t.pp()
    #print(tree_to_str(t))
    #print_subs(t)
print('TOP')
for k in h:
    s = sum(h[k].values())
    for j in h[k]:
Beispiel #20
0
import itertools, collections
from tree import Tree

if __name__ == "__main__":
    # Usage: evalb.py <parse-file> <gold-file>
    # Walks both files in lockstep, one tree per line, and reports the total
    # number of labelled brackets in each. A parse line equal to "NONE"
    # marks a parsing failure: it contributes no parse brackets, but its
    # gold brackets are still counted.
    try:
        _, parsefilename, goldfilename = sys.argv
    except ValueError:
        # wrong number of command-line arguments
        logs.write("usage: evalb.py <parse-file> <gold-file>\n\n")
        sys.exit(1)

    matchcount = parsecount = goldcount = 0

    # itertools.izip exists only on Python 2; the built-in zip is already
    # lazy on Python 3.
    lockstep = getattr(itertools, "izip", zip)

    # `with` closes both files, also when a tree fails to parse.
    with open(parsefilename) as parsefile, open(goldfilename) as goldfile:
        for parseline, goldline in lockstep(parsefile, goldfile):
            goldtree = Tree.parse(goldline)
            goldbrackets = goldtree.label_span_counts()
            goldcount += sum(goldbrackets.values())

            if parseline.strip() == "NONE":  # parsing failure
                continue

            parsetree = Tree.parse(parseline)
            parsebrackets = parsetree.label_span_counts()
            parsecount += sum(parsebrackets.values())

            # A bracket matches at most as often as it occurs in the gold tree.
            for bracket, count in parsebrackets.items():
                matchcount += min(count, goldbrackets[bracket])

    # NOTE(review): matchcount is accumulated but not reported by the
    # statements visible here.
    print("%s\t%d brackets" % (parsefilename, parsecount))
    print("%s\t%d brackets" % (goldfilename, goldcount))
        
    print features.prep_features(sys.argv[1:])

    f = sys.stdin
    while True:
        line = f.readline()
        if line == '':
            break
        if line == '\n':
            continue
        try:
            k, tag = line.strip().split("\t")
        except:
            break  ## can finish earlier
         print k, tag
        k = int(k)
        best_w = None
        for j in xrange(k):
            logprob = float(f.readline().strip())
            parse = f.readline().strip()
            tree = Tree.parse(parse)
            ##print tree

            if j < maxk:
##                print tree
                features.evaluate(tree, tree.get_sent(), j)
                ##print features.pp_fv(fvector, j)
                ##fvector = features.extract(tree, tree.get_sent())
                
                
Beispiel #22
0
                    for Y in unary:
                        if score[Y][i][j] > 0:
                            sc = 0
                            sc = pdict[X][Y] * score[Y][i][j]
                            if sc > score[X][i][j]:
                                score[X][i][j] = sc
                                back[X][i][j] = [Y, j]
                                changed = True

##    ferr.write('\n')
##    ferr.write('Dictionary Final'+'\n')
##    print_dict(score,back)

    try:
        trees.append(
            str(
                debinarize(
                    Tree.parse(backtrace(back, start, 0, len(line)).strip()))))
        scores.append(score[start][0][len(line)])
    except:
        trees.append(None)
        scores.append(None)
    ferr.write(str(trees[-1]) + '\n')
    ferr.write(str(scores[-1]))

# One result per line; a failed parse (None) is written as the text "None".
for tree in trees:
    fout.write((str(tree) if tree is None else tree) + '\n')
logs = sys.stderr

import itertools, collections
from tree import Tree

if __name__ == "__main__":
    # Usage: evalb.py <parse-file> <gold-file>
    # Walks both files in lockstep, one tree per line, and reports the total
    # number of labelled brackets in each. A parse line equal to "NONE"
    # marks a parsing failure: it contributes no parse brackets, but its
    # gold brackets are still counted.
    try:
        _, parsefilename, goldfilename = sys.argv
    except ValueError:
        # wrong number of command-line arguments
        logs.write("usage: evalb.py <parse-file> <gold-file>\n\n")
        sys.exit(1)

    matchcount = parsecount = goldcount = 0

    # itertools.izip exists only on Python 2; the built-in zip is already
    # lazy on Python 3.
    lockstep = getattr(itertools, "izip", zip)

    # `with` closes both files, also when a tree fails to parse.
    with open(parsefilename) as parsefile, open(goldfilename) as goldfile:
        for parseline, goldline in lockstep(parsefile, goldfile):
            goldtree = Tree.parse(goldline)
            goldbrackets = goldtree.label_span_counts()
            # Totals must count occurrences, not distinct brackets: the
            # matching below works on per-bracket counts, so len() would
            # undercount whenever a bracket occurs more than once in a tree.
            goldcount += sum(goldbrackets.values())

            if parseline.strip() == "NONE":  # parsing failure
                continue

            parsetree = Tree.parse(parseline)
            parsebrackets = parsetree.label_span_counts()
            parsecount += sum(parsebrackets.values())

            # A bracket matches at most as often as it occurs in the gold tree.
            for bracket, count in parsebrackets.items():
                matchcount += min(count, goldbrackets[bracket])

    # NOTE(review): matchcount is accumulated but not reported by the
    # statements visible here.
    print("%s\t%d brackets" % (parsefilename, parsecount))
    print("%s\t%d brackets" % (goldfilename, goldcount))
import sys
from tree import Tree

# Usage: <script> DIRFILE TRAINTREE TESTTXT TESTTREE M N
# Splits the treebank in DIRFILE: the last N trees are copied to TESTTREE and
# their sentences written to TESTTXT; the first M trees are copied to
# TRAINTREE.
DIRFILE = sys.argv[1]
TRAINTREE = sys.argv[2]
TESTTXT = sys.argv[3]
TESTTREE = sys.argv[4]
M = int(sys.argv[5])
N = int(sys.argv[6])
print("%d %d" % (M, N))

# Read the treebank once; both splits are slices of the same list.
with open(DIRFILE, "r") as f:
    lines = f.readlines()

# lines[-0:] is the whole list, so N == 0 has to be handled explicitly to
# get an empty test split.
test_lines = lines[-N:] if N else []

with open(TESTTXT, "w") as fw:
    with open(TESTTREE, "w") as fwt:
        for line in test_lines:
            fwt.write(line)
            t = Tree.parse(line.strip())
            fw.write(t.tosent() + '\n')

with open(TRAINTREE, "w") as ftrain:
    for line in lines[:M]:
        ftrain.write(line)
Beispiel #25
0
                    res += self.debinarize(x) + ' '
                else:
                    res += self.debinarize(x)
        return res

    def parse(self, sentence_origin, kbest):
        """Run CKY on the sentence and return the backtracked result.

        When the script was started with more than one command-line
        argument (``len(sys.argv) > 2``), every word missing from
        ``self.lexicon`` is replaced by ``'<unk>'`` before ``self.cky`` is
        called. ``self.backtrack`` receives the original, unreplaced words.
        """
        map_unknown = len(sys.argv) > 2
        sentence = [
            '<unk>' if map_unknown and word not in self.lexicon else word
            for word in sentence_origin
        ]
        chart = self.cky(sentence, kbest)
        return self.backtrack(0, len(sentence), 'TOP', sentence_origin, chart)


if __name__ == "__main__":
    # Parse every whitespace-tokenized stdin line and print the raw result
    # list, followed by each score and its debinarized tree ('NONE' when the
    # tree string is empty).
    kbest = 5
    parser = Parser()
    for line in sys.stdin:
        sentence_origin = line.strip().split()

        result = parser.parse(sentence_origin, kbest)
        print(result)
        for score, res in result:
            print(score)
            if len(res) == 0:
                print('NONE')
                continue
            print(parser.debinarize(Tree.parse(res)))
Beispiel #26
0
def extract_frames_from_parse(parse_tree_string):
    """Take a string representing the parse tree as input, and print the
    semantic parse. The result list consists of a list of tuples, with each
    tuple containing the VerbNet frame and its associated tree."""
    result_list = []

    # In case we're handed an bad string, bail somewhat gracefully
    try:
        parse_tree = Tree.parse(parse_tree_string)
    except ValueError:
        print "Warning: semantics could not parse tree", repr(parse_tree_string)
        return result_list

    # Split clauses to handle them separately
    split_clause_dict = frames.split_clauses(parse_tree)

    # Activize clauses
    for key, (clause, conjunction) in split_clause_dict.items():
        activized_clause = frames.activize_clause(clause)
        split_clause_dict[key] = (activized_clause, conjunction)

    for (clause, conjunction) in split_clause_dict.values():
        # Split conjunctions and duplicate arguments if necessary
        split_tree_dict = frames.split_conjunctions(clause)

        if conjunction != '':
            result_list.append(conjunction)

        for (split_tree, conjunction) in split_tree_dict.values():
            if conjunction != '':
                result_list.append(conjunction)

            for tree in split_tree:
                tag_list = []

                # Store whether there was an existential there
                if frames.is_existential(str(tree)):
                    tag_list.append('ex')

                # Transformational grammar stuff
                tree = frames.existential_there_insertion(tree)
                tree = frames.invert_clause(tree)
                tree = frames.wh_movement(tree)

                if EXTRACT_DEBUG:
                    print 'Transformed tree:'
                    print str(tree)

                verbs = frames.find_verbs(tree)

                # Create VFOs for each verb, then match them to the parse tree
                for verb, negation in verbs:
                    lemmatized_verb = morphy(verb, 'v')
                    vfo_list = frames.create_VerbFrameObjects(lemmatized_verb)
                    match_list = []

                    if EXTRACT_DEBUG:
                        print 'VFO list for %s:' % verb
                        print '\n'.join(str(vfo.frame_list) for vfo in vfo_list)

                    for vfo in vfo_list:
                        match = vfo.match_parse(tree)

                        if match:
                            if EXTRACT_DEBUG:
                                print 'Matched:'
                                print '\t', str(vfo.frame_list)
                                print 'with'
                                print '\t', str(tree)
                            match_list.append((match, vfo.classid))

                    if EXTRACT_DEBUG:
                        print 'Match list:'

                        for m in match_list:
                            print 'Sense:', m[1]
                            for a, b in m[0].items():
                                print a, str(b)
                            print '\n\n'

                    (best_match, sense) = frames.pick_best_match(match_list)

                    if EXTRACT_DEBUG:
                        print 'Chose: '
                        if best_match:
                            for a, b in best_match.items():
                                print a, str(b)
                        else:
                            print str(None)
                        print '\n\n'
                    if not best_match is None:
                        result_list.append((best_match, tree, tag_list, sense, verb, negation))

    return result_list
#!/usr/bin/python

'''
Reads parse trees from a treebank (each line contains one parse tree)
Converts that tree into a binary tree (input is not necessarily binary)
'''


from tree import Tree
import sys

# One tree per stdin line: parse it, binarize it in place, print the result.
for raw_line in sys.stdin:
    tree = Tree.parse(raw_line.strip())
    tree.binarize()
    print(tree)
Beispiel #28
0
def get_semantics_from_parse_tree(parse_tree_string):
    """Find the best VerbNet frame match for each verb in a parse tree.

    The string is parsed with Tree.parse and split into clauses, each of
    which is activized and then split on conjunctions. Every resulting tree
    is passed through frames.existential_there_insertion,
    frames.invert_clause and frames.wh_movement; its verbs are then located
    with a regular expression over str(tree) and matched against their verb
    frame objects.

    Returns a list that interleaves the non-empty conjunction values with
    one (best_match, tree, tag_list, sense) tuple for every verb where
    frames.pick_best_match produced a match.
    """
    root = Tree.parse(parse_tree_string)

    # Clauses are handled separately; each one is replaced by its activized form
    clauses = frames.split_clauses(root)
    for key, (clause, clause_conj) in clauses.items():
        clauses[key] = (frames.activize_clause(clause), clause_conj)

    # Text between a VB/VBD/VBG/VBN/VBP/VBZ tag and the closing parenthesis
    verb_pattern = re.compile(r'(?<=VB[ DGNPZ]) *\w*(?=\))')

    semantics = []

    for clause, clause_conj in clauses.values():
        # Split conjunctions and duplicate arguments if necessary
        conjuncts = frames.split_conjunctions(clause)

        if clause_conj != '':
            semantics.append(clause_conj)

        for subtrees, sub_conj in conjuncts.values():
            if sub_conj != '':
                semantics.append(sub_conj)

            for tree in subtrees:
                # Tag the tree when it is existential, checked before transforming
                tags = ['ex'] if frames.is_existential(str(tree)) else []

                # Transformational grammar steps, applied in this order
                for transform in (frames.existential_there_insertion,
                                  frames.invert_clause,
                                  frames.wh_movement):
                    tree = transform(tree)

                # Lower-cased verb tokens, later lemmatized for the VerbNet lookup
                verbs = [token.strip().lower()
                         for token in verb_pattern.findall(str(tree))]

                # Build the frame objects for each verb and match them to the tree
                for verb in verbs:
                    lemma = morphy(verb, 'v')

                    candidates = []
                    for vfo in frames.create_VerbFrameObjects(lemma):
                        matched = vfo.match_parse(tree)
                        if matched:
                            candidates.append((matched, vfo.classid))

                    best_match, sense = frames.pick_best_match(candidates)
                    if best_match is not None:
                        semantics.append((best_match, tree, tags, sense))

    return semantics
Beispiel #29
0
    def load(filename, lower=True, sentid=0):
        '''now return a generator! use load().next() for singleton.
           and read the last line as the gold tree -- TODO: optional!
           and there is an empty line at the end
        '''

        file = getfile(filename)
        line = None
        total_time = 0
        num_sents = 0
        
        while True:            
            
            start_time = time.time()
            ##'\tThe complicated language in ...\n"
            ## tag is often missing
            try:
                if line is None or line == "\n":
                    line = "\n"
                    while line == "\n":
                        line = file.readline()  # emulate seek                    
                tag, sent = line.split("\t")
            except:
                ## no more forests
                break

            num_sents += 1
            
            sent = sent.split()
            cased_sent = sent [:]
            if lower:
                sent = [w.lower() for w in sent]   # mark johnson: lowercase all words
            num = int(file.readline())

            forest = Forest(num, sent, cased_sent, tag)
            forest.labelspans = {}
            forest.short_edges = {}

            delta = num_spu = 0
            for i in xrange(1, num+1):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = file.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except:
                    keys = line
                    fields = ""


                iden, labelspan, size = keys.split("\t") ## iden can be non-ints
                size = int(size)

                fvector = FVector.parse(fields)
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                if cache_same:
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for j in xrange(size):
                    is_oracle = False

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    tails, fields = file.readline().strip().split(" ||| ")
                    
                    if tails[0] == "*":  #oracle edge
                        is_oracle = True
                        tails = tails[1:]
                        
                    tails = tails.split() ## could be non-integers
                    tailnodes = []

                    for x in tails:
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        ## topological ordering
                        tail = forest.nodes[x]
                        tailnodes.append(tail)

                    use_same = False
                    if fields[-1] == "~":
                        use_same = True
                        fields = fields[:-1]
                        
                    fvector = FVector.parse(fields)
                    edge = Hyperedge(node, tailnodes, fvector)

                    if cache_same:

                        short_edge = edge.shorter()
                        if short_edge in forest.short_edges:
                            edge.same = forest.short_edges[short_edge]
                            if use_same:
                                edge.fvector += edge.same.fvector
                        else:
                            forest.short_edges[short_edge] = edge

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                    
                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => (12, 3, 4)
            tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])   
            forest.nodeorder = [x for (_, x) in tmp]

            forest.rehash()
            sentid += 1
            
##            print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
##                  % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

            forest.root = node
            node.set_root(True)

            line = file.readline()

            if line is not None and line.strip() != "":
                if line[0] == "(":
                    forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=True)
                    line = file.readline()
            else:
                line = None

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)
                
            yield forest

        Forest.load_time = total_time
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, total_time/num_sents)
from collections import defaultdict
from tree import Tree
from pprint import pprint
import sys

# Standard streams, aliased once for the rest of the script.
fin, fout, ferr = sys.stdin, sys.stdout, sys.stderr

# One Tree per input line, parsed from the whitespace-stripped text.
trees = [Tree.parse(raw_line.strip()) for raw_line in fin.readlines()]

# gcounts[parent_label][children_labels] -> count; unseen entries start at 0.0.
gcounts = defaultdict(lambda: defaultdict(float))


def getcfgcounts(tree, gcounts):
    if tree.word == None:
        for t in tree.subs:
            getcfgcounts(t, gcounts)
        if len(tree.subs) == 2:
            children = ''
            for t in tree.subs:
                children += ' ' + t.label
            children = children.strip()
            gcounts[tree.label][children] += 1
        elif len(tree.subs) == 1:
            gcounts[tree.label][tree.subs[0].label] += 1
        else:
            print('something odd here! tree: {} gcounts:{}'.format(