コード例 #1
0
ファイル: forest.py プロジェクト: rupenp/transforest
    def load(filename, lower=True, sentid=0):
        '''Lazily read forests from `filename`, yielding one Forest at a time.

        This is a generator; use load(...).next() to get a single forest.

        Record layout, as consumed below:
          1. a header line "tag<TAB>sentence" (preceding blank lines are
             skipped);
          2. a line holding the number of nodes;
          3. per node, a line "iden<TAB>labelspan<TAB>size", optionally
             followed by " ||| " and node features, then `size` hyperedge
             lines of the form "tails ||| features".  A leading "*" on the
             tails marks the oracle edge; a trailing "~" on the features
             means the features of the previously seen edge with the same
             shorter() key are added (only when `cache_same` is on);
          4. optionally a line starting with "(", parsed as the gold tree.

        Args:
            filename: passed to getfile() to obtain the input stream.
            lower: if true, the words given to the forest are lowercased;
                the original casing is still passed on as `cased_sent`.
            sentid: starting sentence counter; it is incremented per forest
                but not otherwise used in this function.

        Yields:
            Forest objects whose root is the last node read.

        Side effects:
            Logs progress to `logs` every 100 sentences.  Once the generator
            is exhausted, stores the accumulated loading time in
            Forest.load_time and logs a summary (skipped when nothing was
            loaded, which used to raise ZeroDivisionError).
        '''

        infile = getfile(filename)
        line = None          # look-ahead line carried over between forests
        total_time = 0
        num_sents = 0

        while True:

            start_time = time.time()

            ## '\tThe complicated language in ...\n' -- tag is often missing
            ## skip blank separator lines (emulates a seek to the next header)
            while line is None or line == "\n":
                line = infile.readline()

            try:
                tag, sent = line.split("\t")
            except ValueError:
                ## EOF ("") or a line that is not "tag<TAB>sentence":
                ## no more forests
                break

            num_sents += 1

            words = sent.split()
            cased_sent = words[:]
            if lower:
                sent = [w.lower() for w in words]   # mark johnson: lowercase all words
            else:
                sent = words
            num = int(infile.readline())

            forest = Forest(num, sent, cased_sent, tag)
            forest.labelspans = {}
            forest.short_edges = {}

            for _ in xrange(num):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = infile.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except ValueError:
                    ## no (single) feature section on this node line
                    keys = line
                    fields = ""

                iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
                size = int(size)

                node = Node(iden, labelspan, size, FVector.parse(fields), sent)
                forest.add_node(node)

                if cache_same:
                    ## nodes with an identical labelspan share one feature vector
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for _ in xrange(size):

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    tails, fields = infile.readline().strip().split(" ||| ")

                    is_oracle = tails.startswith("*")  # oracle edge
                    if is_oracle:
                        tails = tails[1:]

                    tailnodes = []
                    for x in tails.split():  ## could be non-integers
                        ## topological ordering: tails must already be defined
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        tailnodes.append(forest.nodes[x])

                    use_same = fields.endswith("~")
                    if use_same:
                        fields = fields[:-1]

                    edge = Hyperedge(node, tailnodes, FVector.parse(fields))

                    if cache_same:
                        short_edge = edge.shorter()
                        if short_edge in forest.short_edges:
                            edge.same = forest.short_edges[short_edge]
                            if use_same:
                                edge.fvector += edge.same.fvector
                        else:
                            forest.short_edges[short_edge] = edge

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => [12, 3, 4]; order nodes by that key
            forest.nodeorder = sorted(
                forest.nodeorder,
                key=lambda n: [int(part) for part in n.iden.split("-")])

            forest.rehash()
            sentid += 1

            ## the last node read becomes the root
            ## NOTE(review): if num is 0, `node` is unbound here (or left over
            ## from the previous forest) -- confirm such input cannot occur.
            forest.root = node
            node.set_root(True)

            line = infile.readline()

            if line.strip() != "":
                if line[0] == "(":
                    ## NOTE(review): the gold tree is parsed with lower=True
                    ## regardless of the `lower` argument -- confirm intended.
                    forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=True)
                    line = infile.readline()
                ## otherwise keep `line`: it is treated as the next header
            else:
                line = None

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)

            yield forest

        Forest.load_time = total_time
        if num_sents > 0:
            ## guard the average: an input without forests has num_sents == 0
            print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
                  % (num_sents, total_time, total_time/num_sents)
コード例 #2
0
ファイル: forest.py プロジェクト: zhangxt/lineardpparser
    def load(filename, lower=False, sentid=0):
        '''Generator that reads forests from `filename`, one Forest per record.

        Use load(...).next() to fetch a single forest.

        Each record is consumed as follows:
          * blank lines are skipped, then a header "tag<TAB>sentence" is read;
          * the next line gives the number of nodes;
          * every node has a line "iden<TAB>labelspan<TAB>size" (optionally
            followed by " ||| " and its features) and then `size` hyperedge
            lines "tails ||| features".  Tails starting with "*" mark the
            oracle edge; features ending in "~" request adding the features
            of the previously seen edge with the same shorter() key (this
            only happens when `cache_same` is on);
          * a following line that starts with "(" is read and discarded
            (gold tree parsing is disabled in this function).

        Args:
            filename: handed to getfile() to open the input.
            lower: if true, words are lowercased before being given to the
                forest; the original casing is kept in `cased_sent`.
            sentid: starting sentence counter; incremented per forest but
                not otherwise used in this function.

        Yields:
            Forest objects whose root is the last node read.

        Side effects:
            Logs progress to `logs` every 100 sentences; when the generator
            is exhausted, sets Forest.load_time and, if at least one forest
            was loaded, logs a summary.
        '''

        stream = getfile(filename)

        line = None          # pending look-ahead line, if any
        total_time = 0
        num_sents = 0

        while True:

            start_time = time.time()

            ## '\tThe complicated language in ...\n' -- tag is often missing
            ## advance past blank lines to the next header (emulate seek)
            while line is None or line == "\n":
                line = stream.readline()

            try:
                tag, sent = line.split("\t")
            except ValueError:
                ## readline() returned "" at EOF, or the line is not a
                ## "tag<TAB>sentence" header: no more forests
                break

            num_sents += 1

            tokens = sent.split()
            cased_sent = tokens[:]
            ## mark johnson: lowercase all words
            sent = [w.lower() for w in tokens] if lower else tokens
            num = int(stream.readline())

            forest = Forest(num, sent, cased_sent, tag)
            forest.labelspans = {}
            forest.short_edges = {}

            for _ in xrange(num):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = stream.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except ValueError:
                    ## the node line carries no (single) feature section
                    keys, fields = line, ""

                iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
                size = int(size)

                fvector = FVector(fields)  # TODO: myvector
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                if cache_same:
                    ## the first node seen for a labelspan lends its feature
                    ## vector to later nodes with the same labelspan
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for _ in xrange(size):

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    tails, fields = stream.readline().strip().split(" ||| ")

                    is_oracle = tails.startswith("*")  # oracle edge
                    if is_oracle:
                        tails = tails[1:]

                    tailnodes = []
                    for x in tails.split():  ## could be non-integers
                        ## topological ordering: every tail must be known already
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        tailnodes.append(forest.nodes[x])

                    use_same = fields.endswith("~")
                    if use_same:
                        fields = fields[:-1]

                    edge = Hyperedge(node, tailnodes, FVector(fields))

                    if cache_same:
                        short_edge = edge.shorter()
                        if short_edge in forest.short_edges:
                            edge.same = forest.short_edges[short_edge]
                            if use_same:
                                edge.fvector += edge.same.fvector
                        else:
                            forest.short_edges[short_edge] = edge

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => [12, 3, 4]; sort nodes on that key
            forest.nodeorder = sorted(
                forest.nodeorder,
                key=lambda n: [int(piece) for piece in n.iden.split("-")])

            forest.rehash()
            sentid += 1

            ## the last node read becomes the root
            ## NOTE(review): with num == 0, `node` is unbound here (or stale
            ## from the previous forest) -- confirm such input cannot occur.
            forest.root = node
            node.set_root(True)

            line = stream.readline()

            if line.strip() != "":
                if line[0] == "(":
                    ## gold tree line: parsing is disabled here, just skip it
                    line = stream.readline()
                ## otherwise keep `line`: it is treated as the next header
            else:
                line = None

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)

            yield forest

        Forest.load_time = total_time
        if num_sents > 0:
            print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
                  % (num_sents, total_time, total_time/num_sents)