def read_features(file):
    feature2ind = {}
    ind2feature = {}
    features = utility.getfile(file, 0)
    for l in features:
        i, feature = l.strip().split("\t")
        feature2ind[feature] = int(i)
        ind2feature[int(i)] = feature
    return (feature2ind, ind2feature)
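# Hedged usage sketch for read_features(): assumes a tab-separated
# "index<TAB>feature" file, the same layout written by the --feature_map
# branch of the feature-adder main() further below.  The filename
# "features.map" is purely illustrative.
def _demo_read_features():
    feature2ind, ind2feature = read_features("features.map")
    # the two maps are inverses of each other
    for ind, feat in ind2feature.iteritems():
        assert feature2ind[feat] == ind
    return len(feature2ind)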
def readonebest(f):
    '''1-best output, or gold'''
    f = getfile(f)
    while True:
        line = f.readline()
        if line == '':
            break
        if line == '\n':
            continue
        yield Tree.parse(line.strip(), trunc=True, lower=True)
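# Hedged usage sketch for readonebest(): iterates over a file with one
# parenthesized tree per line, skipping blank lines.  "dev.1best" is an
# illustrative filename.
def _demo_readonebest():
    trees = list(readonebest("dev.1best"))
    return len(trees)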
def main():
    from ngram import Ngram
    from model import Model
    from forest import Forest

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")

    argv = FLAGS(sys.argv)
    [outfile] = argv[1:]

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()
    false_decoder = CYKDecoder(weights, lm)
    out = utility.getfile(outfile, 1)

    old_bleu = Bleu()
    new_bleu = Bleu()

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
        oracle_forest, oracle_item = oracle_extracter(forest, weights, false_decoder, 100, 2, extract=100)
        print >> sys.stderr, "processed sent %s " % i
        oracle_forest.dump(out)

        bleu, hyp, fv, edgelist = forest.compute_oracle(weights, 0.0, 1)

        forest.bleu.rescore(hyp)
        old_bleu += forest.bleu

        forest.bleu.rescore(oracle_item[0].full_derivation)
        new_bleu += forest.bleu

        bad_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1)

        #for i in range(min(len(oracle_item), 5)):
        #    print >> sys.stderr, "Oracle Trans: %s %s %s" % (oracle_item[i].full_derivation, oracle_item[i].score, str(oracle_item[i].score[2]))
        #    print >> sys.stderr, "Oracle BLEU Score: %s" % (forest.bleu.rescore(oracle_item[i].full_derivation))

        print >> sys.stderr, "Oracle BLEU Score: %s" % (forest.bleu.rescore(oracle_item[0].full_derivation))
        print >> sys.stderr, "Worst new Oracle BLEU Score: %s" % (bad_bleu)
        print >> sys.stderr, "Old Oracle BLEU Score: %s" % (bleu)
        print >> sys.stderr, "Running Oracle BLEU Score: %s" % (new_bleu.compute_score())
        print >> sys.stderr, "Running Old Oracle BLEU Score: %s" % (old_bleu.compute_score())
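# Entry-point sketch (assumption: this main() lives in its own oracle-
# extraction script).  The model weights and language model are taken from
# the command-line flags consumed by Model.cmdline_model() and
# Ngram.cmdline_ngram(), translation forests are read from stdin via
# Forest.load("-", ...), and oracle forests are dumped to the single
# positional argument <outfile>:
#
# if __name__ == "__main__":
#     main()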
def readkbest(f, read_gold=False):
    f = getfile(f)
    while True:  # now < len(lines):
        line = f.readline()  # lines[now]
        if line == '':
            break
        if line == '\n':
            continue
        try:
            k, tag = line.strip().split("\t")
            k = int(k)
        except:
            break  ## can finish earlier
        kparses = []
        for stuff in readkparses(f, int(k)):
            kparses.append(stuff)
        goldtree = Tree.parse(f.readline().strip(), trunc=True, lower=True) if read_gold \
                   else None
        yield NBestForest(k, tag, kparses, goldtree)
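# Hedged usage sketch for readkbest(): assumes the k-best file layout read
# above (a "k<TAB>tag" header, k parses consumed by readkparses(), then one
# gold-tree line when read_gold=True).  "dev.kbest" is an illustrative
# filename.
def _demo_readkbest():
    count = 0
    for nbforest in readkbest("dev.kbest", read_gold=True):
        count += 1
    return count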
def main():
    flags.DEFINE_boolean("feature_map", False, "create a feature map for all features in the data")
    argv = FLAGS(sys.argv)
    (trans_forest_filename, with_feature_filename) = argv[1:]

    trans_forests = forest.Forest.load(trans_forest_filename, True)
    tdm = TargetDataManager()
    feature_adder = FeatureAdder(tdm)
    outfile = utility.getfile(with_feature_filename, 1)

    allfeats = set()
    allfeats.add("lm1")
    allfeats.add("lm")

    for i, tforest in enumerate(trans_forests, 1):
        print >> sys.stderr, "processed sent %s " % i
        if FLAGS.feature_map:
            allfeats |= feature_adder.add_features(tforest, just_list=True)
        else:
            feature_adder.add_features(tforest)
            tforest.dump(outfile)

    if FLAGS.feature_map:
        for i, feat in enumerate(allfeats):
            print >> outfile, str(i) + "\t" + feat
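# Hedged round-trip sketch: the "index<TAB>feature" map emitted by the
# --feature_map branch above is exactly the layout that read_features()
# (defined earlier) parses back.  "features.map" is an illustrative filename.
def _demo_feature_map_roundtrip():
    feature2ind, _ = read_features("features.map")
    # "lm" and "lm1" are always seeded into the map above
    return feature2ind["lm"], feature2ind["lm1"]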
def load(filename, lower=True, sentid=0):
    '''now return a generator! use load().next() for singleton.
       and read the last line as the gold tree -- TODO: optional!
       and there is an empty line at the end
    '''
    file = getfile(filename)
    line = None
    total_time = 0
    num_sents = 0

    while True:
        start_time = time.time()
        ## '\tThe complicated language in ...\n'
        ## tag is often missing
        try:
            if line is None or line == "\n":
                line = "\n"
                while line == "\n":
                    line = file.readline()  # emulate seek
            tag, sent = line.split("\t")
        except:
            ## no more forests
            break

        num_sents += 1

        sent = sent.split()
        cased_sent = sent[:]
        if lower:
            sent = [w.lower() for w in sent]  # mark johnson: lowercase all words

        num = int(file.readline())
        forest = Forest(num, sent, cased_sent, tag)
        forest.labelspans = {}
        forest.short_edges = {}
        delta = num_spu = 0

        for i in xrange(1, num + 1):
            ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
            ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
            line = file.readline()
            try:
                keys, fields = line.split(" ||| ")
            except:
                keys = line
                fields = ""

            iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
            size = int(size)

            fvector = FVector.parse(fields)
            node = Node(iden, labelspan, size, fvector, sent)
            forest.add_node(node)

            if cache_same:
                if labelspan in forest.labelspans:
                    node.same = forest.labelspans[labelspan]
                    node.fvector = node.same.fvector
                else:
                    forest.labelspans[labelspan] = node

            for j in xrange(size):
                is_oracle = False

                ## '\t1 ||| 0=8.86276 1=2 3\n'
                tails, fields = file.readline().strip().split(" ||| ")

                if tails[0] == "*":  # oracle edge
                    is_oracle = True
                    tails = tails[1:]

                tails = tails.split()  ## could be non-integers
                tailnodes = []
                for x in tails:
                    assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                           "(in a hyperedge of node #%s) before being defined" % iden  ## topological ordering
                    tail = forest.nodes[x]
                    tailnodes.append(tail)

                use_same = False
                if fields[-1] == "~":
                    use_same = True
                    fields = fields[:-1]

                fvector = FVector.parse(fields)
                edge = Hyperedge(node, tailnodes, fvector)

                if cache_same:
                    short_edge = edge.shorter()
                    if short_edge in forest.short_edges:
                        edge.same = forest.short_edges[short_edge]
                        if use_same:
                            edge.fvector += edge.same.fvector
                    else:
                        forest.short_edges[short_edge] = edge

                node.add_edge(edge)
                if is_oracle:
                    node.oracle_edge = edge

            if node.sp_terminal():
                node.word = node.edges[0].subs[0].word

        ## splitted nodes 12-3-4 => (12, 3, 4)
        tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
        forest.nodeorder = [x for (_, x) in tmp]

        forest.rehash()
        sentid += 1

        ## print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
        ##     % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

        forest.root = node
        node.set_root(True)

        line = file.readline()
        if line is not None and line.strip() != "":
            if line[0] == "(":
                forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=True)
                line = file.readline()
        else:
            line = None

        total_time += time.time() - start_time

        if num_sents % 100 == 0:
            print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                  % (num_sents, total_time/num_sents)

        yield forest

    Forest.load_time = total_time
    if num_sents > 0:  # guard against division by zero on an empty input
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, total_time/num_sents)
def load(filename, lower=False, sentid=0):
    '''now return a generator! use load().next() for singleton.
       and read the last line as the gold tree -- TODO: optional!
       and there is an empty line at the end
    '''
    file = getfile(filename)
    line = None
    total_time = 0
    num_sents = 0

    while True:
        start_time = time.time()
        ## '\tThe complicated language in ...\n'
        ## tag is often missing
        try:
            if line is None or line == "\n":
                line = "\n"
                while line == "\n":
                    line = file.readline()  # emulate seek
            tag, sent = line.split("\t")
        except:
            ## no more forests
            break

        num_sents += 1

        sent = sent.split()
        cased_sent = sent[:]
        if lower:
            sent = [w.lower() for w in sent]  # mark johnson: lowercase all words

        num = int(file.readline())
        forest = Forest(num, sent, cased_sent, tag)
        forest.labelspans = {}
        forest.short_edges = {}
        delta = num_spu = 0

        for i in xrange(1, num + 1):
            ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
            ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
            line = file.readline()
            try:
                keys, fields = line.split(" ||| ")
            except:
                keys = line
                fields = ""

            iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
            size = int(size)

            fvector = FVector(fields)  # TODO: myvector
            node = Node(iden, labelspan, size, fvector, sent)
            forest.add_node(node)

            if cache_same:
                if labelspan in forest.labelspans:
                    node.same = forest.labelspans[labelspan]
                    node.fvector = node.same.fvector
                else:
                    forest.labelspans[labelspan] = node

            for j in xrange(size):
                is_oracle = False

                ## '\t1 ||| 0=8.86276 1=2 3\n'
                tails, fields = file.readline().strip().split(" ||| ")

                if tails[0] == "*":  # oracle edge
                    is_oracle = True
                    tails = tails[1:]

                tails = tails.split()  ## could be non-integers
                tailnodes = []
                for x in tails:
                    assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                           "(in a hyperedge of node #%s) before being defined" % iden  ## topological ordering
                    tail = forest.nodes[x]
                    tailnodes.append(tail)

                use_same = False
                if fields[-1] == "~":
                    use_same = True
                    fields = fields[:-1]

                fvector = FVector(fields)
                edge = Hyperedge(node, tailnodes, fvector)

                if cache_same:
                    short_edge = edge.shorter()
                    if short_edge in forest.short_edges:
                        edge.same = forest.short_edges[short_edge]
                        if use_same:
                            edge.fvector += edge.same.fvector
                    else:
                        forest.short_edges[short_edge] = edge

                node.add_edge(edge)
                if is_oracle:
                    node.oracle_edge = edge

            if node.sp_terminal():
                node.word = node.edges[0].subs[0].word

        ## splitted nodes 12-3-4 => (12, 3, 4)
        tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
        forest.nodeorder = [x for (_, x) in tmp]

        forest.rehash()
        sentid += 1

        ## print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
        ##     % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

        forest.root = node
        node.set_root(True)

        line = file.readline()
        if line is not None and line.strip() != "":
            if line[0] == "(":
                ## forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                line = file.readline()
        else:
            line = None

        total_time += time.time() - start_time

        if num_sents % 100 == 0:
            print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                  % (num_sents, total_time/num_sents)

        yield forest

    Forest.load_time = total_time

    if num_sents > 0:
        print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
              % (num_sents, total_time, total_time/num_sents)
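# Hedged usage sketch for the parse-forest load() above: it is a generator
# over Forest objects, one per sentence in the file; use load(...).next()
# when only a single forest is wanted.  "dev.forest" is an illustrative
# filename.
def _demo_load_parse_forests():
    for forest in load("dev.forest"):
        print >> logs, "root node #%s, %d nodes" % (forest.root.iden, len(forest.nodes))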
def readweights(filename):
    '''read the first line only. weights must start with a "W" '''
    line = getfile(filename).readline()
    if line[0] == "W":
        return FVector.parse(line[1:])
    return None
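# Hedged usage sketch for readweights(): returns an FVector parsed from the
# first line when it starts with "W", and None otherwise.  "weights.txt" is
# an illustrative filename.
def _demo_readweights():
    weights = readweights("weights.txt")
    return weights is not None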
def load(filename, is_tforest=False, lower=False, sentid=0, first=None, lm=None):
    '''now returns a generator! use load().next() for singleton.
       and read the last line as the gold tree -- TODO: optional!
       and there is an empty line at the end
    '''
    if first is None:  # N.B.: must be here, not in the param line (after program initializes)
        first = FLAGS.first

    file = getfile(filename)
    line = None
    total_time = 0
    num_sents = 0

    while True:
        start_time = time.time()
        ## '\tThe complicated language in ...\n'
        ## tag is often missing
        line = file.readline()  # emulate seek
        if len(line) == 0:
            break
        try:
            ## strict format, no consecutive breaks
            # if line is None or line == "\n":
            #     line = "\n"
            #     while line == "\n":
            #         line = file.readline()  # emulate seek
            tag, sent = line.split("\t")  # foreign sentence
        except:
            ## no more forests
            yield None
            continue

        num_sents += 1

        # caching the original, word-based, true-case sentence
        sent = sent.split()  ## no splitting with " "
        cased_sent = sent[:]
        if lower:
            sent = [w.lower() for w in sent]  # mark johnson: lowercase all words

        #sent = words_to_chars(sent, encode_back=True)  # split to chars

        ## read in references
        refnum = int(file.readline().strip())
        refs = []
        for i in xrange(refnum):
            refs.append(file.readline().strip())

        ## sizes: number of nodes, number of edges (optional)
        num, nedges = map(int, file.readline().split("\t"))

        forest = Forest(sent, cased_sent, tag, is_tforest)
        forest.tag = tag
        forest.refs = refs
        forest.bleu = Bleu(refs=refs)  ## initial (empty test) bleu; used repeatedly later
        forest.labelspans = {}
        forest.short_edges = {}
        forest.rules = {}

        for i in xrange(1, num + 1):
            ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
            ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
            line = file.readline()
            try:
                keys, fields = line.split(" ||| ")
            except:
                keys = line
                fields = ""

            iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
            size = int(size)

            fvector = Vector(fields)
            ## remove_blacklist(fvector)
            node = Node(iden, labelspan, size, fvector, sent)
            forest.add_node(node)

            if cache_same:
                if labelspan in forest.labelspans:
                    node.same = forest.labelspans[labelspan]
                    node.fvector = node.same.fvector
                else:
                    forest.labelspans[labelspan] = node

            for j in xrange(size):
                is_oracle = False

                ## '\t1 ||| 0=8.86276 1=2 3\n'
                ## N.B.: can't just strip! "\t... ||| ... ||| \n" => 2 fields instead of 3
                tails, rule, fields = file.readline().strip("\t\n").split(" ||| ")

                if tails != "" and tails[0] == "*":  # oracle edge
                    is_oracle = True
                    tails = tails[1:]

                tails = tails.split()  ## N.B.: don't split by " "!
                tailnodes = []
                lhsstr = []  # 123 "thank" 456
                lmstr = []
                lmscore = 0
                lmlhsstr = []

                for x in tails:
                    if x[0] == '"':  # word
                        word = desymbol(x[1:-1])
                        lhsstr.append(word)  ## desymbol here and only here; dump will call quoteattr

                        if lm is not None:
                            this = lm.word2index(word)
                            lmscore += lm.ngram.wordprob(this, lmstr)
                            lmlhsstr.append(this)
                            lmstr += [this, ]

                    else:  # variable
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        tail = forest.nodes[x]
                        tailnodes.append(tail)
                        lhsstr.append(tail)

                        if lm is not None:
                            lmstr = []  # "..." "..." x0 "..."
                            lmlhsstr.append(tail)  # sync with lhsstr

                fvector = Vector(fields)
                if lm is not None:
                    fvector["lm1"] = lmscore  # hack

                edge = Hyperedge(node, tailnodes, fvector, lhsstr)
                edge.lmlhsstr = lmlhsstr  ## new

                x = rule.split()
                edge.ruleid = int(x[0])
                if len(x) > 1:
                    edge.rule = Rule.parse(" ".join(x[1:]) + " ### " + fields)
                    forest.rules[edge.ruleid] = edge.rule  #" ".join(x[1:]) #, None)
                else:
                    edge.rule = forest.rules[edge.ruleid]  # cached rule

                node.add_edge(edge)
                if is_oracle:
                    node.oracle_edge = edge

            if node.sp_terminal():
                node.word = node.edges[0].subs[0].word

        ## splitted nodes 12-3-4 => (12, 3, 4)
        tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
        forest.nodeorder = [x for (_, x) in tmp]

        forest.rehash()
        sentid += 1

        ## print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
        ##     % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

        forest.root = node
        node.set_root(True)

        line = file.readline()
        if line is not None and line.strip() != "":
            if line[0] == "(":
                forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                line = file.readline()
        else:
            line = None

        forest.number_nodes()
        #print forest.root.position_id

        total_time += time.time() - start_time

        if num_sents % 100 == 0:
            print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                  % (num_sents, total_time/num_sents)

        forest.subtree()  # compute the subtree string for each node

        yield forest

        if first is not None and num_sents >= first:
            break

    # better check here instead of zero-division exception
    if num_sents == 0:
        print >> logs, "NO FORESTS FOUND!!! (empty input file?)"
        sys.exit(1)
        # yield None  # new: don't halt -- WHY?

    Forest.load_time = total_time
    print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
          % (num_sents, total_time, total_time/(num_sents + 0.001))
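# Hedged usage sketch for the translation-forest loader above: it is also a
# generator; is_tforest=True marks translation forests, `first` caps the
# number of sentences (defaulting to FLAGS.first), and passing an n-gram LM
# as `lm` pre-computes the within-rule "lm1" scores on each hyperedge.
# "dev.tforest" and the cap of 10 sentences are purely illustrative.
def _demo_load_tforests(lm=None):
    for forest in Forest.load("dev.tforest", is_tforest=True, first=10, lm=lm):
        print >> logs, "%s: %d refs, %d nodes" % \
              (forest.tag, len(forest.refs), len(forest.nodes))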