コード例 #1
0
ファイル: nbestdecoder.py プロジェクト: rupenp/transforest
    def load(filename):
        '''Lazily read n-best lists from filename, yielding one NBestList each.

        Expected layout, TAB-separated (one header line, then k parse lines):

           small.13.1      7       50
           #0      5 7           0=-42.9527 ...
           ...

        The header is split into (tag, goldsize, k).  Each parse line is split
        into (sentid, sizes, <unused>, fv), where sizes holds the two
        whitespace-separated integers "matchbr testbr" and fv is handed to
        FVector.parse.
        N.B.  a dummy TAB between sizes and fvector, sorry.

        Every yielded list has oracle_tree, oracle_fvector, oracle_pp and
        oracle_size_ratio set from the parse with the smallest Parseval
        (smaller compares as better here).

        This is a generator: once it is exhausted, NBestList.load_time holds
        the accumulated loading time and a summary line is written to logs.

        Raises ValueError when a header announces a list with no parses.
        '''

        total_time = 0
        num_sents = 0
        f = getfile(filename)

        # readline() is used for the header as well as for the parse lines:
        # both loops pull from the same handle, and (assuming getfile returns
        # a regular Python 2 file object) `for line in f` would read ahead
        # and desynchronise the inner readline() calls.
        while True:

            start_time = time.time()

            line = f.readline()
            if line == '':
                break

            num_sents += 1
            tag, goldsize, k = line.split("\t")
            goldsize = int(goldsize)
            k = int(k)

            kparses = []
            # Reset the oracle for every list; otherwise an empty list would
            # silently inherit the oracle of the previous sentence.
            oracle = None
            oracle_testbr = None
            best_pp = None   ## CAREFUL! could be 0
            for i in xrange(k):
                sentid, sizes, _, fv = f.readline().split("\t")
                matchbr, testbr = map(int, sizes.split())
                fvector = FVector.parse(fv)
                pp = Parseval.get_parseval(matchbr, testbr, goldsize)

                curr = [fvector, pp]
                kparses.append(curr)

                if best_pp is None or pp < best_pp:  ## < is better in oracle
                    best_pp = pp
                    oracle = curr
                    oracle_testbr = testbr

            if oracle is None:
                raise ValueError("empty n-best list for %s (k=%d)" % (tag, k))

            forest = NBestList(k, tag, kparses, goldsize)
            forest.oracle_tree = oracle
            forest.oracle_fvector, forest.oracle_pp = oracle

            if Decoder.MAX_NUM_BRACKETS < 0:
                forest.oracle_size_ratio = 1
            else:
                # NOTE(review): if MAX_NUM_BRACKETS is an int and the module
                # does not use `from __future__ import division`, this
                # truncates under Python 2 -- confirm that is intended.
                forest.oracle_size_ratio = oracle_testbr / Decoder.MAX_NUM_BRACKETS

            # Only time spent reading/parsing is counted; the consumer's work
            # between yields happens after this point.
            total_time += time.time() - start_time

            yield forest

        NBestList.load_time = total_time
        # An empty input has no per-sentence average; avoid dividing by zero.
        avg_time = total_time / num_sents if num_sents else 0.0
        print >> logs, "%d nbest lists loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, avg_time)
コード例 #2
0
ファイル: oracle.py プロジェクト: rupenp/transforest
def forest_oracle(forest, goldtree, del_puncs=False, prune_results=False):
    """Find the oracle derivation of forest with respect to goldtree.

    Nodes are processed iteratively in the forest's own iteration order
    (non-recursive, topological-sort style): every sub-node of an edge must
    already carry its `oracles` table when its parent is reached, which is
    asserted below.

    Arguments:
        forest        -- forest to search; if forest.root already has an
                         `oracle_edge` attribute the cached oracle is
                         returned via extract_oracle(forest).
        goldtree      -- gold tree supplying all_label_spans() (and tag_seq
                         when del_puncs is set).
        del_puncs     -- when true, the forest and index mapping come from
                         check_puncs(forest, goldtree.tag_seq).
        prune_results -- when true, prune() is applied to each node's table.

    Returns the tuple (-best_score, best_parseval, best_tree, best_edgelist).

    Side effects: sets `oracles` on every visited node and `oracle_edge` on
    the head of every edge in the chosen derivation.

    Raises ValueError if the root table offers no candidate at all.
    """

    if hasattr(forest.root, "oracle_edge"):
        return extract_oracle(forest)

    ## modifies forest also!!
    if del_puncs:
        idx_mapping, newforest = check_puncs(forest, goldtree.tag_seq)
    else:
        idx_mapping, newforest = lambda x: x, forest

    goldspans = merge_labels(goldtree.all_label_spans(), idx_mapping)
    goldbrs = set(goldspans)  ## including TOP

    for node in newforest:
        if node.is_terminal():
            results = Oracles.unit("(%s %s)" % (node.label, node.word))  ## multiplication unit

        else:
            # (a, b) mirrors the layout read back from `res` further down:
            # `a` goes into the num_test slot and `b` into the num_matched
            # slot.  Spurious nodes add nothing; other nodes add one test
            # bracket, matched only if the labelled span is in the gold set.
            if node.is_spurious():
                a, b = 0, 0
            elif merge_label((node.label, node.span), idx_mapping) in goldbrs:
                a, b = 1, 1
            else:
                a, b = 1, 0

            results = Oracles()  ## addition unit
            for edge in node.edges:
                edgeres = Oracles.unit()  ## multiplication unit

                for sub in edge.subs:
                    assert hasattr(sub, "oracles"), "%s ; %s ; %s" % (node, sub, edge)
                    edgeres = edgeres * sub.oracles

                assert 0 in edge.fvector, edge
                # Scores are accumulated negated; the sign is restored on return.
                nodehead = (a, RES((b, -edge.fvector[0], [edge])))
                results += nodehead * edgeres  ## mul

        if prune_results:
            prune(results)
        node.oracles = results
        if debug:
            print >> logs, node.labelspan(), "\n", results, "----------"

    res = (-1, RES((-1, 0, []))) * newforest.root.oracles  ## scale, remove TOP match

    num_gold = len(goldspans) - 1  ## omit TOP.  N.B. goldspans, not brackets! (NP (NP ...))

    best_parseval = None
    best_score = None
    best_edgelist = None
    for num_test in res:
        num_matched, score, edgelist = res[num_test]
        this = Parseval.get_parseval(num_matched, num_test, num_gold)
        if best_parseval is None or this < best_parseval:
            best_parseval = this
            best_score = score
            best_edgelist = edgelist

    # Without this guard an empty root table would surface as an unbound
    # local at the deriv2tree call below.
    if best_edgelist is None:
        raise ValueError("forest_oracle: no oracle candidate found at the forest root")

    best_tree = Hyperedge.deriv2tree(best_edgelist)

    ## annotate the forest for oracle so that next-time you can preload oracle
    for edge in best_edgelist:
        edge.head.oracle_edge = edge

    ## very careful here: desymbol !
    return -best_score, best_parseval, best_tree, best_edgelist