Esempio n. 1
0
def read_features(file):
  feature2ind = {}
  ind2feature = {}
  features = utility.getfile(file, 0)
  for l in features:
    i, feature = l.strip().split("\t")
    feature2ind[feature] = int(i)
    ind2feature[int(i)] = feature
  return (feature2ind, ind2feature)
Esempio n. 2
0
def readonebest(f):
	'''1-best output, or gold'''
							 
	f = getfile(f)
	while True:
		line = f.readline()
		if line == '':
			break
		if line == '\n':
			continue

		yield Tree.parse(line.strip(), trunc=True, lower=True)
Esempio n. 3
0
def main():
  from ngram import Ngram
  from model import Model
  from forest import Forest
  
  flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
  flags.DEFINE_integer("debuglevel", 0, "debug level")
  flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
  flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
  flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
  flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
  

  argv = FLAGS(sys.argv)
  [outfile] = argv[1:]
  weights = Model.cmdline_model()
  lm = Ngram.cmdline_ngram()
  

  false_decoder = CYKDecoder(weights, lm)
  out = utility.getfile(outfile, 1)
  old_bleu = Bleu()
  new_bleu = Bleu()
  
  for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
    
    oracle_forest, oracle_item = oracle_extracter(forest, weights, false_decoder, 100, 2, extract=100)
    print >>sys.stderr, "processed sent %s " % i
    oracle_forest.dump(out)
    bleu, hyp, fv, edgelist = forest.compute_oracle(weights, 0.0, 1)

    forest.bleu.rescore(hyp)
    old_bleu += forest.bleu
    forest.bleu.rescore(oracle_item[0].full_derivation)
    new_bleu += forest.bleu

    bad_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1)
    #for i in range(min(len(oracle_item), 5)):
     # print >>sys.stderr, "Oracle Trans: %s %s %s" %(oracle_item[i].full_derivation, oracle_item[i].score, str(oracle_item[i].score[2]))
     # print >>sys.stderr, "Oracle BLEU Score: %s"% (forest.bleu.rescore(oracle_item[i].full_derivation))
    print >>sys.stderr, "Oracle BLEU Score: %s"% (forest.bleu.rescore(oracle_item[0].full_derivation))
    print >>sys.stderr, "Worst new Oracle BLEU Score: %s"% (bad_bleu)
    print >>sys.stderr, "Old Oracle BLEU Score: %s"% (bleu)
    
    print >>sys.stderr, "Running Oracle BLEU Score: %s"% (new_bleu.compute_score())
    print >>sys.stderr, "Running Old Oracle BLEU Score: %s"% (old_bleu.compute_score())
Esempio n. 4
0
def readkbest(f, read_gold=False):

	f = getfile(f)
	while True: #now < len(lines):
		line = f.readline() #lines[now]
		if line == '':
			break
		if line == '\n':
			continue
		try:
			k, tag = line.strip().split("\t")
			k = int(k)
		except:
			break  ## can finish earlier

		kparses = []
		for stuff in readkparses(f, int(k)):
			kparses.append(stuff)
			
		goldtree = Tree.parse(f.readline().strip(), trunc=True, lower=True) if read_gold \
				   else None
 		yield NBestForest(k, tag, kparses, goldtree)
Esempio n. 5
0
def main():
  flags.DEFINE_boolean("feature_map", False, "create a feature map for all features in the data")  
  argv = FLAGS(sys.argv)
  
  (trans_forest_filename, with_feature_filename) = argv[1:]
  trans_forests = forest.Forest.load(trans_forest_filename, True)
  tdm = TargetDataManager()
  feature_adder = FeatureAdder(tdm)
  outfile = utility.getfile(with_feature_filename, 1)
  allfeats = set()
  allfeats.add("lm1")
  allfeats.add("lm")
  for i, tforest in enumerate(trans_forests, 1):
    print >>sys.stderr, "processed sent %s " % i
    
    if FLAGS.feature_map:
      allfeats |= feature_adder.add_features(tforest, just_list = True)
    else:
      feature_adder.add_features(tforest)
      tforest.dump(outfile)

  if FLAGS.feature_map:
    for i, feat in enumerate(allfeats):
      print >>outfile, str(i)+ "\t" + feat
Esempio n. 6
0
    def load(filename, lower=True, sentid=0):
        '''now return a generator! use load().next() for singleton.
           and read the last line as the gold tree -- TODO: optional!
           and there is an empty line at the end
        '''

        file = getfile(filename)
        line = None
        total_time = 0
        num_sents = 0
        
        while True:            
            
            start_time = time.time()
            ##'\tThe complicated language in ...\n"
            ## tag is often missing
            try:
                if line is None or line == "\n":
                    line = "\n"
                    while line == "\n":
                        line = file.readline()  # emulate seek                    
                tag, sent = line.split("\t")
            except:
                ## no more forests
                break

            num_sents += 1
            
            sent = sent.split()
            cased_sent = sent [:]
            if lower:
                sent = [w.lower() for w in sent]   # mark johnson: lowercase all words
            num = int(file.readline())

            forest = Forest(num, sent, cased_sent, tag)
            forest.labelspans = {}
            forest.short_edges = {}

            delta = num_spu = 0
            for i in xrange(1, num+1):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = file.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except:
                    keys = line
                    fields = ""


                iden, labelspan, size = keys.split("\t") ## iden can be non-ints
                size = int(size)

                fvector = FVector.parse(fields)
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                if cache_same:
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for j in xrange(size):
                    is_oracle = False

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    tails, fields = file.readline().strip().split(" ||| ")
                    
                    if tails[0] == "*":  #oracle edge
                        is_oracle = True
                        tails = tails[1:]
                        
                    tails = tails.split() ## could be non-integers
                    tailnodes = []

                    for x in tails:
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        ## topological ordering
                        tail = forest.nodes[x]
                        tailnodes.append(tail)

                    use_same = False
                    if fields[-1] == "~":
                        use_same = True
                        fields = fields[:-1]
                        
                    fvector = FVector.parse(fields)
                    edge = Hyperedge(node, tailnodes, fvector)

                    if cache_same:

                        short_edge = edge.shorter()
                        if short_edge in forest.short_edges:
                            edge.same = forest.short_edges[short_edge]
                            if use_same:
                                edge.fvector += edge.same.fvector
                        else:
                            forest.short_edges[short_edge] = edge

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                    
                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => (12, 3, 4)
            tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])   
            forest.nodeorder = [x for (_, x) in tmp]

            forest.rehash()
            sentid += 1
            
##            print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
##                  % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

            forest.root = node
            node.set_root(True)

            line = file.readline()

            if line is not None and line.strip() != "":
                if line[0] == "(":
                    forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=True)
                    line = file.readline()
            else:
                line = None

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)
                
            yield forest

        Forest.load_time = total_time
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, total_time/num_sents)
Esempio n. 7
0
    def load(filename, lower=False, sentid=0):
        '''now return a generator! use load().next() for singleton.
           and read the last line as the gold tree -- TODO: optional!
           and there is an empty line at the end
        '''

        file = getfile(filename)

        line = None
        total_time = 0
        num_sents = 0

        while True:

            start_time = time.time()
            ##'\tThe complicated language in ...\n"
            ## tag is often missing
            try:
                if line is None or line == "\n":
                    line = "\n"
                    while line == "\n":
                        line = file.readline()  # emulate seek
                tag, sent = line.split("\t")
            except:
                ## no more forests
                break

            num_sents += 1

            sent = sent.split()
            cased_sent = sent[:]
            if lower:
                sent = [w.lower()
                        for w in sent]  # mark johnson: lowercase all words
            num = int(file.readline())

            forest = Forest(num, sent, cased_sent, tag)
            forest.labelspans = {}
            forest.short_edges = {}

            delta = num_spu = 0
            for i in xrange(1, num + 1):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = file.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except:
                    keys = line
                    fields = ""

                iden, labelspan, size = keys.split(
                    "\t")  ## iden can be non-ints
                size = int(size)

                fvector = FVector(fields)  # TODO: myvector
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                if cache_same:
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for j in xrange(size):
                    is_oracle = False

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    tails, fields = file.readline().strip().split(" ||| ")

                    if tails[0] == "*":  #oracle edge
                        is_oracle = True
                        tails = tails[1:]

                    tails = tails.split()  ## could be non-integers
                    tailnodes = []

                    for x in tails:
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        ## topological ordering
                        tail = forest.nodes[x]
                        tailnodes.append(tail)

                    use_same = False
                    if fields[-1] == "~":
                        use_same = True
                        fields = fields[:-1]

                    fvector = FVector(fields)
                    edge = Hyperedge(node, tailnodes, fvector)

                    if cache_same:

                        short_edge = edge.shorter()
                        if short_edge in forest.short_edges:
                            edge.same = forest.short_edges[short_edge]
                            if use_same:
                                edge.fvector += edge.same.fvector
                        else:
                            forest.short_edges[short_edge] = edge

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge

                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => (12, 3, 4)
            tmp = sorted([(map(int, x.iden.split("-")), x)
                          for x in forest.nodeorder])
            forest.nodeorder = [x for (_, x) in tmp]

            forest.rehash()
            sentid += 1

            ##          print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
            ##                % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

            forest.root = node
            node.set_root(True)

            line = file.readline()

            if line is not None and line.strip() != "":
                if line[0] == "(":
                    ##                    forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                    line = file.readline()
            else:
                line = None

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)

            yield forest

        Forest.load_time = total_time
        if num_sents > 0:
            print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
                  % (num_sents, total_time, total_time/num_sents)
Esempio n. 8
0
	def readweights(filename):
		'''read the first line only. weights must start with a "W" '''
		line = getfile(filename).readline()
		if line[0] == "W":
			return FVector.parse(line[1:])
		return None
Esempio n. 9
0
 def readweights(filename):
     '''read the first line only. weights must start with a "W" '''
     line = getfile(filename).readline()
     if line[0] == "W":
         return FVector.parse(line[1:])
     return None
Esempio n. 10
0
    def load(filename, is_tforest=False, lower=False, sentid=0, first=None, lm=None):
        '''now returns a generator! use load().next() for singleton.
           and read the last line as the gold tree -- TODO: optional!
           and there is an empty line at the end
        '''
        if first is None: # N.B.: must be here, not in the param line (after program initializes)
            first = FLAGS.first
            
        file = getfile(filename)
        line = None
        total_time = 0
        num_sents = 0        
        
        while True:            
            
            start_time = time.time()
            ##'\tThe complicated language in ...\n"
            ## tag is often missing
            line = file.readline()  # emulate seek
            if len(line) == 0:
                break
            try:
                ## strict format, no consecutive breaks
#                 if line is None or line == "\n":
#                     line = "\n"
#                     while line == "\n":
#                         line = file.readline()  # emulate seek
                        
                tag, sent = line.split("\t")   # foreign sentence
                
            except:
                ## no more forests
                yield None
                continue

            num_sents += 1

            # caching the original, word-based, true-case sentence
            sent = sent.split() ## no splitting with " "
            cased_sent = sent [:]            
            if lower:
                sent = [w.lower() for w in sent]   # mark johnson: lowercase all words

            #sent = words_to_chars(sent, encode_back=True)  # split to chars

            ## read in references
            refnum = int(file.readline().strip())
            refs = []
            for i in xrange(refnum):
                refs.append(file.readline().strip())

            ## sizes: number of nodes, number of edges (optional)
            num, nedges = map(int, file.readline().split("\t"))   

            forest = Forest(sent, cased_sent, tag, is_tforest)

            forest.tag = tag

            forest.refs = refs
            forest.bleu = Bleu(refs=refs)  ## initial (empty test) bleu; used repeatedly later
            
            forest.labelspans = {}
            forest.short_edges = {}
            forest.rules = {}

            for i in xrange(1, num+1):

                ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
                ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
                line = file.readline()
                try:
                    keys, fields = line.split(" ||| ")
                except:
                    keys = line
                    fields = ""

                iden, labelspan, size = keys.split("\t") ## iden can be non-ints
                size = int(size)

                fvector = Vector(fields) #
##                remove_blacklist(fvector)
                node = Node(iden, labelspan, size, fvector, sent)
                forest.add_node(node)

                if cache_same:
                    if labelspan in forest.labelspans:
                        node.same = forest.labelspans[labelspan]
                        node.fvector = node.same.fvector
                    else:
                        forest.labelspans[labelspan] = node

                for j in xrange(size):
                    is_oracle = False

                    ## '\t1 ||| 0=8.86276 1=2 3\n'
                    ## N.B.: can't just strip! "\t... ||| ... ||| \n" => 2 fields instead of 3
                    tails, rule, fields = file.readline().strip("\t\n").split(" ||| ")

                    if tails != "" and tails[0] == "*":  #oracle edge
                        is_oracle = True
                        tails = tails[1:]

                    tails = tails.split() ## N.B.: don't split by " "!
                    tailnodes = []
                    lhsstr = [] # 123 "thank" 456

                    lmstr = []
                    lmscore = 0
                    lmlhsstr = []
                    
                    for x in tails:
                        if x[0]=='"': # word
                            word = desymbol(x[1:-1])
                            lhsstr.append(word)  ## desymbol here and only here; ump will call quoteattr
                            
                            if lm is not None:
                                this = lm.word2index(word)
                                lmscore += lm.ngram.wordprob(this, lmstr)
                                lmlhsstr.append(this)
                                lmstr += [this,]
                                
                        else: # variable

                            assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                                         "(in a hyperedge of node #%s) before being defined" % iden
                            tail = forest.nodes[x]
                            tailnodes.append(tail)
                            lhsstr.append(tail)                            

                            if lm is not None:
                                lmstr = []  # "..." "..." x0 "..."
                                lmlhsstr.append(tail) # sync with lhsstr

                    fvector = Vector(fields)
                    if lm is not None:
                        fvector["lm1"] = lmscore # hack

                    edge = Hyperedge(node, tailnodes, fvector, lhsstr)
                    edge.lmlhsstr = lmlhsstr

                    ## new
                    x = rule.split()
                    edge.ruleid = int(x[0])
                    if len(x) > 1:
                        edge.rule = Rule.parse(" ".join(x[1:]) + " ### " + fields)
                        forest.rules[edge.ruleid] = edge.rule #" ".join(x[1:]) #, None)
                    else:
                        edge.rule = forest.rules[edge.ruleid] # cahced rule

                    node.add_edge(edge)
                    if is_oracle:
                        node.oracle_edge = edge
                    
                if node.sp_terminal():
                    node.word = node.edges[0].subs[0].word

            ## splitted nodes 12-3-4 => (12, 3, 4)
            tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])   
            forest.nodeorder = [x for (_, x) in tmp]

            forest.rehash()
            sentid += 1
            
##            print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
##                  % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

            forest.root = node
            node.set_root(True)
            line = file.readline()

            if line is not None and line.strip() != "":
                if line[0] == "(":
                    forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                    line = file.readline()
            else:
                line = None

            forest.number_nodes()
            #print forest.root.position_id
          

            total_time += time.time() - start_time

            if num_sents % 100 == 0:
                print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                      % (num_sents, total_time/num_sents)

            forest.subtree() #compute the subtree string for each node

            yield forest

            if first is not None and num_sents >= first:
                break                

        # better check here instead of zero-division exception
        if num_sents == 0:
            print >> logs, "NO FORESTS FOUND!!! (empty input file?)"
            sys.exit(1)            
#            yield None # new: don't halt -- WHY?
        
        Forest.load_time = total_time
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, total_time/(num_sents+0.001))