def load(filename):
    '''Lazily read n-best lists from *filename*, yielding one NBestList per sentence.

    Input format, one record per sentence, fields separated by TABs:

        small.13.1 <TAB> 7 <TAB> 50                  header: tag, goldsize, k
        #0 <TAB> 5 7 <TAB> <TAB> 0=-42.9527 ...      k lines: sentid, "matchbr testbr", (dummy), fvector
        ...

    N.B. a dummy TAB between sizes and fvector, sorry.

    Each yielded NBestList additionally carries:
      oracle_tree        -- the [fvector, parseval] pair with the smallest parseval
                            (``<`` is "better"), or None when k == 0
      oracle_fvector     -- first element of that pair (None when k == 0)
      oracle_pp          -- second element of that pair (None when k == 0)
      oracle_size_ratio  -- 1 if Decoder.MAX_NUM_BRACKETS < 0, otherwise
                            oracle testbr / Decoder.MAX_NUM_BRACKETS

    Once the input is exhausted, the accumulated parsing time (time spent by
    the consumer between yields is not counted) is stored in
    NBestList.load_time and a summary line is written to ``logs``.
    '''
    total_time = 0
    num_sents = 0
    f = getfile(filename)

    while True:
        start_time = time.time()
        line = f.readline()
        if line == '':
            # readline() returns the empty string only at end of file
            break
        num_sents += 1

        tag, goldsize, k = line.split("\t")
        goldsize = int(goldsize)
        k = int(k)

        kparses = []
        best_pp = None
        ## CAREFUL! k could be 0: reset the oracle for every sentence so that an
        ## empty list never inherits the oracle of the previous sentence.
        oracle = None
        oracle_testbr = 0
        for i in xrange(k):
            sentid, sizes, _, fv = f.readline().split("\t")
            matchbr, testbr = map(int, sizes.split())
            fvector = FVector.parse(fv)
            pp = Parseval.get_parseval(matchbr, testbr, goldsize)

            curr = [fvector, pp]
            kparses.append(curr)
            if best_pp is None or pp < best_pp:  ## < is better in oracle
                best_pp = pp
                oracle = curr
                oracle_testbr = testbr

        forest = NBestList(k, tag, kparses, goldsize)
        forest.oracle_tree = oracle
        if oracle is None:
            # empty n-best list: there is no oracle parse to unpack
            forest.oracle_fvector, forest.oracle_pp = None, None
        else:
            forest.oracle_fvector, forest.oracle_pp = oracle

        if Decoder.MAX_NUM_BRACKETS < 0:
            forest.oracle_size_ratio = 1
        else:
            # NOTE(review): with two ints this is floor division under Python 2
            # unless the module enables true division -- confirm which is intended.
            forest.oracle_size_ratio = oracle_testbr / Decoder.MAX_NUM_BRACKETS

        total_time += time.time() - start_time
        yield forest

    NBestList.load_time = total_time
    # an empty input has no sentences to average over
    avg_time = total_time / num_sents if num_sents else 0.0
    print >> logs, "%d nbest lists loaded in %.2lf secs (avg %.2lf per sent)" \
          % (num_sents, total_time, avg_time)
def forest_oracle(forest, goldtree, del_puncs=False, prune_results=False):
    """Compute the oracle derivation of *forest* with respect to *goldtree*.

    returns best_score, best_parseval, best_tree, edgelist
    now non-recursive topol-sort-style

    If forest.root already has an ``oracle_edge`` attribute, the result of
    extract_oracle(forest) is returned instead of recomputing.

    Parameters:
      forest        -- forest to search; iterating it must visit the nodes of
                       an edge's ``subs`` before the edge's head (asserted below).
      goldtree      -- gold tree; supplies all_label_spans() and, when
                       del_puncs is set, tag_seq.
      del_puncs     -- if true, run check_puncs(forest, goldtree.tag_seq) and
                       use the index mapping and forest it returns.
      prune_results -- if true, call prune() on each node's oracle table.

    Returns the tuple (-best_score, best_parseval, best_tree, best_edgelist)
    for the entry of the root table whose Parseval compares smallest
    (``<`` is "better"); best_tree is Hyperedge.deriv2tree(best_edgelist).

    Side effects: sets ``node.oracles`` on every visited node and
    ``edge.head.oracle_edge`` for every edge of the best derivation, so that a
    later call can take the extract_oracle() shortcut.

    Raises ValueError if the root table has no entries.
    """
    if hasattr(forest.root, "oracle_edge"):
        return extract_oracle(forest)

    ## modifies forest also!!
    if del_puncs:
        idx_mapping, newforest = check_puncs(forest, goldtree.tag_seq)
    else:
        idx_mapping, newforest = (lambda x: x), forest

    goldspans = merge_labels(goldtree.all_label_spans(), idx_mapping)
    goldbrs = set(goldspans)  ## including TOP

    for node in newforest:
        if node.is_terminal():
            results = Oracles.unit("(%s %s)" % (node.label, node.word))  ## multiplication unit
        else:
            # (a, b) is what this node contributes to every derivation through
            # it; presumably a counts test brackets and b counts matched
            # brackets (cf. num_test / num_matched when the root is read out).
            if node.is_spurious():
                a, b = 0, 0
            elif merge_label((node.label, node.span), idx_mapping) in goldbrs:
                a, b = 1, 1
            else:
                a, b = 1, 0

            results = Oracles()  ## addition unit
            for edge in node.edges:
                edgeres = Oracles.unit()  ## multiplication unit
                for sub in edge.subs:
                    assert hasattr(sub, "oracles"), "%s ; %s ; %s" % (node, sub, edge)
                    edgeres = edgeres * sub.oracles  ## mul

                assert 0 in edge.fvector, edge
                nodehead = (a, RES((b, -edge.fvector[0], [edge])))
                results += nodehead * edgeres  ## add

        if prune_results:
            prune(results)
        node.oracles = results
        if debug:
            print >> logs, node.labelspan(), "\n", results, "----------"

    res = (-1, RES((-1, 0, []))) * newforest.root.oracles  ## scale, remove TOP match
    num_gold = len(goldspans) - 1  ## omit TOP.  N.B. goldspans, not brackets! (NP (NP ...))

    best_parseval = None
    best_score = None
    best_edgelist = None
    for num_test in res:
        num_matched, score, edgelist = res[num_test]
        this = Parseval.get_parseval(num_matched, num_test, num_gold)
        if best_parseval is None or this < best_parseval:
            best_parseval = this
            best_score = score
            best_edgelist = edgelist

    if best_edgelist is None:
        # nothing was selected above: fail with a clear message instead of an
        # unbound-local error further down
        raise ValueError("forest_oracle: no oracle derivation found at the forest root")

    best_tree = Hyperedge.deriv2tree(best_edgelist)

    ## annotate the forest for oracle so that next-time you can preload oracle
    for edge in best_edgelist:
        edge.head.oracle_edge = edge

    ## very careful here: best_tree is returned as built by deriv2tree (no desymbol applied)
    return -best_score, best_parseval, best_tree, best_edgelist