def evaluate(self, trees):
    """Score the parser's predictions for ``trees`` against the gold trees.

    Every tree's sentence is re-parsed with ``self.fm`` and ``self.network``.
    The prediction is compared with the reference as a constituency tree and,
    after conversion through ``const2rst``, as an RST tree.

    Returns a dict of accumulated scores keyed by "const", "disc_const",
    "seg", "span", "nucs" and "labels".
    """
    from spanparser.parser import Parser as SParser
    from spanparser.phrase_tree import FScore

    metric_names = ("const", "disc_const", "seg", "span", "nucs", "labels")
    totals = dict((name, FScore()) for name in metric_names)
    # key in the evaltrees() result -> key in the returned dict
    rst_keys = (("segs", "seg"), ("spans", "span"),
                ("nucs", "nucs"), ("labels", "labels"))

    for gold in trees:
        guess = SParser.parse(gold.sentence, self.fm, self.network)
        totals["const"] += guess.compare(gold)
        totals["disc_const"] += compare_disc_constituency(guess, gold)

        guess_rst = const2rst(Tree.parse(str(guess)),
                              keep_punct=FLAGS.eval_rstpunct)
        gold_rst = const2rst(Tree.parse(str(gold)),
                             keep_punct=FLAGS.eval_rstpunct)
        rst_scores = evaltrees(guess_rst, gold_rst)
        for source_key, target_key in rst_keys:
            totals[target_key] += rst_scores[source_key]

    return totals
def main():
    """Evaluate a saved span-parser model on the test treebank.

    The feature mapper is rebuilt from ``FLAGS.train``, the network is loaded
    from ``FLAGS.model`` and every tree of ``FLAGS.test`` is re-parsed.
    Constituency, discourse-constituency and RST (segmentation, span,
    nuclearity, label) scores are accumulated and written to stderr, followed
    by the per-label scores. With ``FLAGS.verbose`` the input sentence,
    reference and predicted RST trees are printed to stdout for each tree.
    """
    from spanparser.phrase_tree import PhraseTree, FScore
    from spanparser.parser import Parser as SParser
    from spanparser.features import FeatureMapper
    from spanparser.network import Network
    from rst_evaluation import const2rst, evaltrees, compare_disc_constituency
    from tree import Tree

    SParser.extlabelfeatures = FLAGS.extlabelfeatures
    training_trees = PhraseTree.load_treefile(FLAGS.train)
    fm = FeatureMapper(training_trees)
    network = Network.load(FLAGS.model)
    print("Loaded model from {}".format(FLAGS.model), file=sys.stderr)

    trees = PhraseTree.load_treefile(FLAGS.test)
    print("Evaluating on {}".format(FLAGS.test), file=sys.stderr)

    const_acc = FScore()
    dis_const_acc = FScore()
    seg_acc, span_acc, nucs_acc, labels_acc = (
        FScore(), FScore(), FScore(), FScore())
    label_specific_acc = defaultdict(FScore)

    for i, tree in enumerate(trees):
        predicted = SParser.parse(tree.sentence, fm, network)
        local_const_acc = predicted.compare(tree)
        local_dis_const_acc = compare_disc_constituency(predicted, tree)
        const_acc += local_const_acc
        dis_const_acc += local_dis_const_acc

        predicted_rst = const2rst(Tree.parse(str(predicted)),
                                  keep_punct=FLAGS.eval_rstpunct)
        ref_rst = const2rst(Tree.parse(str(tree)),
                            keep_punct=FLAGS.eval_rstpunct)
        accuracies = evaltrees(predicted_rst, ref_rst, label_specific=True)

        if FLAGS.verbose:
            print("######## Tree {} ########".format(i))
            print("# input")
            print(tree.sentence)
            print("# reference")
            print(ref_rst.pretty_str())
            print("# predicted")
            print(predicted_rst.pretty_str())

        seg_acc += accuracies['segs']
        span_acc += accuracies['spans']
        nucs_acc += accuracies['nucs']
        labels_acc += accuracies['labels']
        # .items() exists on both Python 2 and 3 dicts; .iteritems() is
        # Python 2 only and would break next to the print() function calls.
        for label, acc in accuracies['label_specific'].items():
            label_specific_acc[label] += acc

    print('Const {}, DIS:Const {}, Seg {}, Span {}, Nucs {}, Labels {}'.format(
        const_acc, dis_const_acc, seg_acc, span_acc, nucs_acc, labels_acc),
        file=sys.stderr)
    print('----------- label-specific accuracies -----------', file=sys.stderr)
    for label, acc in label_specific_acc.items():
        print(label, acc, file=sys.stderr)
def main():
    """Read one tree per line from stdin and print its binarized form."""
    for raw_line in sys.stdin:
        parsed = treelib.parse(raw_line)
        print(binarize(parsed))
def readkparses(f, k):
    """Yield ``k`` (feature vector, tree) pairs read from ``f``.

    Each parse takes two consecutive lines: the feature-vector line first,
    then the bracketed parse.
    """
    for _ in xrange(k):
        features = FVector.parse(f.readline().strip())
        bracketed = f.readline().strip()
        yield (features, Tree.parse(bracketed, trunc=True, lower=True))
def check_in_language(tree_str, conf_file):
    """Return the tree's string form if its constraints have a model.

    The tree is parsed from ``tree_str``, constraints are generated from the
    features read from ``conf_file`` and conjoined; ``None`` is returned when
    ``get_model`` yields nothing truthy for the conjunction.
    """
    cfg_f, cfg_r = read_features(conf_file)
    sent_t = Tree.parse(tree_str.strip(), trunc=False)
    _, constraints = get_sat_constraints(sent_t, cfg_f, cfg_r, {}, [])
    if not get_model(And(constraints)):
        return None
    return tree_to_str(sent_t)
def readonebest(f):
    '''Yield one tree per non-blank line of ``f`` (1-best output, or gold).'''
    stream = getfile(f)
    # readline() returns '' only at end of file, which ends the iteration
    for raw in iter(stream.readline, ''):
        if raw != '\n':
            yield Tree.parse(raw.strip(), trunc=True, lower=True)
def load_data_file(self, file_path, word_counter):
    """Load labelled premise/hypothesis pairs from a tab-separated file.

    Each line is expected to hold ``label<TAB>premise<TAB>hypothesis``.
    Lines whose label is not a key of ``self.relations`` are skipped.
    Tokens are lower-cased when ``self.lower_case`` is set and counted
    through ``self.inc_word_counts(token, word_counter)``.

    Returns a list of dicts with keys ``'label'`` (the mapped relation),
    ``'id'`` (position in the returned list), ``'p_tree'`` and ``'h_tree'``
    (``Tree`` objects parsed from the space-joined tokens).
    """
    data = []
    # The context manager closes the file even if a line fails to parse.
    with open(file_path, 'r') as f:
        for line in f:
            line_split = line.strip().split('\t')
            gold_label = line_split[0]
            # `in` works on Python 2 and 3; dict.has_key() is Python 2 only.
            if gold_label not in self.relations:
                continue

            premise = line_split[1].strip().split()
            hypothese = line_split[2].strip().split()
            if self.lower_case:
                premise = [p.lower() for p in premise]
                hypothese = [h.lower() for h in hypothese]

            for p in premise:
                self.inc_word_counts(p, word_counter)
            for h in hypothese:
                self.inc_word_counts(h, word_counter)

            p_tree = Tree()
            h_tree = Tree()
            p_tree.parse(' '.join(premise))
            h_tree.parse(' '.join(hypothese))

            data.append({
                'label': self.relations[gold_label],
                'id': len(data),
                'p_tree': p_tree,
                'h_tree': h_tree,
            })
    return data
def main():
    """Build a PCFG from the trees on stdin, normalize it and print it."""
    label_counts, pcfg = {}, {}
    for raw_line in sys.stdin:
        update_counts(treelib.parse(raw_line), label_counts, pcfg)
    normalize_pcfg(pcfg, label_counts)
    print_pcfg(pcfg)
def binarizeTree(tree):
    """Walk ``tree`` recursively, printing debug traces, and return it.

    NOTE(review): for a node with more than two children an intermediate
    tree (``temptree``) is built from all children but the first and
    binarized, but it is never attached to ``tree`` -- the assignment that
    would do so is commented out below -- so ``tree`` itself is returned
    without being rebound. Nesting of the statements was reconstructed from a
    whitespace-collapsed source; confirm against the original file.
    """
    if tree.word == None:
        # Node carries no word: visit its children first.
        for t in tree.subs:
            binarizeTree(t)
        print('tree here: {} subs: {}'.format(tree, tree.subs))
        if len(tree.subs) > 2:
            # Primed label for the intermediate node covering subs[1:].
            newlabel = tree.label + "'"
            temptreestr = '(' + newlabel + ' ' + ' '.join(
                [t.dostr() for t in tree.subs[1:]]) + ')'
            print('subs: {} new label: {} temptreestr: {}'.format(
                tree.subs, newlabel, temptreestr))
            temptree = Tree.parse(temptreestr)
            binarizeTree(temptree)
            ## tree = '('+tree.label+' '+temptree.dostr()+')'
            print('new original tree: {}'.format(tree))
    return tree
def main():
    """Count words over all stdin trees, then print each tree after
    passing it through ``replace_onecount`` with those counts."""
    word_counts = {}
    parsed_trees = []
    for raw_line in sys.stdin:
        parsed = treelib.parse(raw_line)
        parsed_trees.append(parsed)
        for token in get_words(parsed, []):
            word_counts[token] = word_counts.get(token, 0) + 1
    # Counts must be complete before any tree is rewritten.
    for parsed in parsed_trees:
        print(replace_onecount(parsed, word_counts))
def load_trees():
    """Load paired RST and constituency trees for the ``wsj*`` documents.

    Every ``*.dis.tok`` file under ``FLAGS.rst_path`` whose base name starts
    with ``wsj`` is joined into a single line and parsed as an ``RSTTree``.
    The matching ``<basename>.cleangold`` file under ``FLAGS.const_path`` is
    parsed line by line into constituency ``Tree`` objects.

    Returns a list of ``(basename, rst_tree, constituency_trees)`` tuples.
    Documents that fail to load are reported on ``logs`` and skipped.
    """
    rstpath = "%s/" % FLAGS.rst_path
    constpath = "%s/" % FLAGS.const_path
    treepairs = []
    for rstf in glob.glob(rstpath + "*.dis.tok"):
        basename = rstf.rsplit("/", 1)[1].split(".")[0]
        if not basename.startswith("wsj"):
            continue
        try:
            # Context managers close both files even when parsing fails.
            with open(rstf) as rst_file:
                rstline = " ".join([line.strip() for line in rst_file])
            rstt = RSTTree.parse(rstline)
            with open(constpath + basename + ".cleangold") as const_file:
                constts = [Tree.parse(line.strip()) for line in const_file]
            treepairs.append((basename, rstt, constts))
        except Exception:
            # Best effort: report and move on, but let KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            print >> logs, "Failed in loading", basename
    return treepairs
def readkbest(f, read_gold=False):
    """Yield one ``NBestForest`` per k-best block read from ``f``.

    A block starts with a ``k<TAB>tag`` header, followed by ``k`` parses
    (consumed by ``readkparses``) and, when ``read_gold`` is true, one gold
    tree line. Blank lines between blocks are skipped; reading stops at end
    of file or at the first header that does not parse.
    """
    stream = getfile(f)
    for header in iter(stream.readline, ''):
        if header == '\n':
            continue
        try:
            k, tag = header.strip().split("\t")
            k = int(k)
        except:
            ## can finish earlier
            break

        kparses = list(readkparses(stream, int(k)))
        if read_gold:
            goldtree = Tree.parse(stream.readline().strip(),
                                  trunc=True, lower=True)
        else:
            goldtree = None
        yield NBestForest(k, tag, kparses, goldtree)
def load(filename, lower=False, sentid=0):
    '''Generate Forest objects read from ``filename``.

    This is a generator; use load().next() for a singleton.

    Layout of one forest, as parsed below:
      * a header line ``tag<TAB>sentence`` (blank lines before it are
        skipped);
      * a line holding the number of nodes;
      * per node, ``iden<TAB>labelspan<TAB>size[ ||| features]`` followed by
        ``size`` hyperedge lines ``tails ||| features``. A leading ``*`` on
        the tails marks the oracle edge; a trailing ``~`` on the features
        means they are added to those of the cached identical edge;
      * optionally a gold tree line starting with ``(``.

    Reading stops at the first line that cannot be split into a header.
    ``lower`` lower-cases the sentence words (the cased copy is also handed
    to the Forest). When the generator is exhausted, ``Forest.load_time`` is
    set to the accumulated loading time and a summary is written to ``logs``.
    '''
    file = getfile(filename)
    line = None
    total_time = 0
    num_sents = 0

    while True:
        start_time = time.time()
        ##'\tThe complicated language in ...\n"
        ## tag is often missing
        try:
            if line is None or line == "\n":
                line = "\n"
                while line == "\n":
                    line = file.readline()  # emulate seek
            tag, sent = line.split("\t")
        except:
            ## no more forests
            break

        num_sents += 1

        sent = sent.split()
        cased_sent = sent[:]
        if lower:
            sent = [w.lower() for w in sent]  # mark johnson: lowercase all words
        num = int(file.readline())

        forest = Forest(num, sent, cased_sent, tag)
        forest.labelspans = {}
        forest.short_edges = {}

        for i in xrange(1, num + 1):
            ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
            ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
            line = file.readline()
            try:
                keys, fields = line.split(" ||| ")
            except:
                # no node-level features on this line
                keys = line
                fields = ""

            iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
            size = int(size)

            fvector = FVector(fields)
            node = Node(iden, labelspan, size, fvector, sent)
            forest.add_node(node)

            if cache_same:
                # nodes with the same labelspan share one feature vector
                if labelspan in forest.labelspans:
                    node.same = forest.labelspans[labelspan]
                    node.fvector = node.same.fvector
                else:
                    forest.labelspans[labelspan] = node

            for j in xrange(size):
                is_oracle = False

                ## '\t1 ||| 0=8.86276 1=2 3\n'
                tails, fields = file.readline().strip().split(" ||| ")

                if tails[0] == "*":  # oracle edge
                    is_oracle = True
                    tails = tails[1:]

                tails = tails.split()  ## could be non-integers
                tailnodes = []

                for x in tails:
                    assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                           "(in a hyperedge of node #%s) before being defined" % iden
                    ## topological ordering
                    tail = forest.nodes[x]
                    tailnodes.append(tail)

                use_same = False
                if fields[-1] == "~":
                    use_same = True
                    fields = fields[:-1]

                fvector = FVector(fields)
                edge = Hyperedge(node, tailnodes, fvector)

                if cache_same:
                    short_edge = edge.shorter()
                    if short_edge in forest.short_edges:
                        edge.same = forest.short_edges[short_edge]
                        if use_same:
                            edge.fvector += edge.same.fvector
                    else:
                        forest.short_edges[short_edge] = edge

                node.add_edge(edge)
                if is_oracle:
                    node.oracle_edge = edge

            if node.sp_terminal():
                node.word = node.edges[0].subs[0].word

        ## splitted nodes 12-3-4 => (12, 3, 4)
        tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
        forest.nodeorder = [x for (_, x) in tmp]

        forest.rehash()
        sentid += 1

        ## print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
        ##       % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

        # the last node read is the root
        forest.root = node
        node.set_root(True)

        line = file.readline()

        if line is not None and line.strip() != "":
            if line[0] == "(":
                forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                line = file.readline()
        else:
            line = None

        total_time += time.time() - start_time

        if num_sents % 100 == 0:
            print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                  % (num_sents, total_time/num_sents)

        yield forest

    Forest.load_time = total_time
    # An input without any forest leaves num_sents at 0; guard the average
    # so exhausting the generator does not raise ZeroDivisionError.
    avg_time = total_time / num_sents if num_sents else 0.0
    print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
          % (num_sents, total_time, avg_time)
from collections import defaultdict from tree import Tree from pprint import pprint import sys fin = sys.stdin fout = sys.stdout ferr = sys.stderr trees = [Tree.parse(line.strip()) for line in fin.readlines()] def binarize(tree): # no children if tree.subs == None: return tree if len(tree.subs) <= 2: newsubs = [] for subtree in tree.subs: newsubs.append(binarize(subtree)) return Tree(tree.label, tree.span, subs=newsubs) else: subs = [tree.subs[0]] newsubs = tree.subs[1:] newspan = [subs[0].span[1], tree.span[0]] if '_' in tree.label: newlabel = tree.label else:
lexicon.add(word.strip().split()[0]) # read in grammers with open(sys.argv[1], 'r') as f: rules = f.readlines() for rule in rules[1:]: p = rule.split() k1 = p[0] if len(p) == 5: k2 = (p[2], ) runa[k1][k2] = -math.log(float(p[-1])) else: k2 = (p[2], p[3]) rbin[k1][k2] = -math.log(float(p[-1])) for line in sys.stdin: sentence_origin = line.strip().split() sentence = [] for x in sentence_origin: if len(sys.argv) > 2 and x not in lexicon: x = '<unk>' sentence.append(x) dp, back = cky(sentence) print(math.exp(-dp[0][len(sentence)]['TOP'])) res = backtrack(0, len(sentence), 'TOP', sentence_origin, back) if len(res) != 0: print debinarize(Tree.parse(res)) else: print 'NONE'
from __future__ import division from tree import Tree from collections import defaultdict import sys TRAINFILE = sys.argv[1] PRODFILE = sys.argv[2] freqs = defaultdict(int) condCounts = defaultdict(int) with open(TRAINFILE, "r") as f: for line in f.readlines(): line = line.strip() t = Tree.parse(line, trunc=True) #t.binarize() #print t.getProductions() prods = t.getProductions() for (x, y) in prods: freqs[(x, y)] += 1 condCounts[x] += 1 with open(PRODFILE, "w") as fw: for (x, y), freq in freqs.iteritems(): p = freq / condCounts[x] fw.write("%s -> %s # %.4f\n" % (x, y, p))
def load(filename, is_tforest=False, lower=False, sentid=0, first=None, lm=None):
    '''Generate Forest objects read from ``filename``.

    This is a generator; use load().next() for a singleton.

    Layout of one forest, as parsed below:
      * a header line ``tag<TAB>sentence``; a non-empty line that does not
        split into exactly two fields makes the generator yield ``None`` and
        move on to the next line;
      * a line with the number of references, then that many reference lines;
      * a line ``num<TAB>nedges`` with the node and edge counts;
      * per node, ``iden<TAB>labelspan<TAB>size[ ||| features]`` followed by
        ``size`` hyperedge lines ``tails ||| rule ||| features``. A leading
        ``*`` on the tails marks the oracle edge; quoted tail items are words,
        the others are ids of previously defined nodes;
      * optionally a gold tree line starting with ``(``.

    ``first`` limits the number of forests loaded and defaults to
    ``FLAGS.first``. When ``lm`` is given, a language-model score over the
    words of each edge is stored in the edge features under ``"lm1"``.
    If no forest was read at all, a message is logged and the process exits
    with status 1. Otherwise ``Forest.load_time`` is set on completion.
    '''
    if first is None:  # N.B.: must be here, not in the param line (after program initializes)
        first = FLAGS.first

    file = getfile(filename)
    line = None
    total_time = 0
    num_sents = 0

    while True:
        start_time = time.time()
        ##'\tThe complicated language in ...\n"
        ## tag is often missing
        line = file.readline()  # emulate seek
        if len(line) == 0:
            # end of file
            break
        try:
            ## strict format, no consecutive breaks
            # if line is None or line == "\n":
            #     line = "\n"
            #     while line == "\n":
            #         line = file.readline()  # emulate seek
            tag, sent = line.split("\t")  # foreign sentence
        except:
            ## no more forests
            yield None
            continue

        num_sents += 1

        # caching the original, word-based, true-case sentence
        sent = sent.split()  ## no splitting with " "
        cased_sent = sent [:]
        if lower:
            sent = [w.lower() for w in sent]  # mark johnson: lowercase all words

        #sent = words_to_chars(sent, encode_back=True)  # split to chars

        ## read in references
        refnum = int(file.readline().strip())
        refs = []
        for i in xrange(refnum):
            refs.append(file.readline().strip())

        ## sizes: number of nodes, number of edges (optional)
        # NOTE(review): the unpacking below requires both fields to be present.
        num, nedges = map(int, file.readline().split("\t"))

        forest = Forest(sent, cased_sent, tag, is_tforest)

        forest.tag = tag

        forest.refs = refs
        forest.bleu = Bleu(refs=refs)  ## initial (empty test) bleu; used repeatedly later

        forest.labelspans = {}
        forest.short_edges = {}
        forest.rules = {}

        for i in xrange(1, num+1):
            ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
            ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
            line = file.readline()
            try:
                keys, fields = line.split(" ||| ")
            except:
                # no node-level features on this line
                keys = line
                fields = ""

            iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
            size = int(size)

            fvector = Vector(fields)  #
            ## remove_blacklist(fvector)
            node = Node(iden, labelspan, size, fvector, sent)
            forest.add_node(node)

            if cache_same:
                # nodes with the same labelspan share one feature vector
                if labelspan in forest.labelspans:
                    node.same = forest.labelspans[labelspan]
                    node.fvector = node.same.fvector
                else:
                    forest.labelspans[labelspan] = node

            for j in xrange(size):
                is_oracle = False

                ## '\t1 ||| 0=8.86276 1=2 3\n'
                ## N.B.: can't just strip! "\t... ||| ... ||| \n" => 2 fields instead of 3
                tails, rule, fields = file.readline().strip("\t\n").split(" ||| ")

                if tails != "" and tails[0] == "*":  # oracle edge
                    is_oracle = True
                    tails = tails[1:]

                tails = tails.split()  ## N.B.: don't split by " "!
                tailnodes = []
                lhsstr = []  # 123 "thank" 456

                # lmstr: LM history since the last variable;
                # lmlhsstr: word indices and tail nodes, parallel to lhsstr
                lmstr = []
                lmscore = 0
                lmlhsstr = []

                for x in tails:
                    if x[0]=='"':  # word
                        word = desymbol(x[1:-1])
                        lhsstr.append(word)  ## desymbol here and only here; ump will call quoteattr

                        if lm is not None:
                            this = lm.word2index(word)
                            lmscore += lm.ngram.wordprob(this, lmstr)
                            lmlhsstr.append(this)
                            lmstr += [this,]

                    else:  # variable
                        assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                               "(in a hyperedge of node #%s) before being defined" % iden
                        tail = forest.nodes[x]
                        tailnodes.append(tail)
                        lhsstr.append(tail)

                        # NOTE(review): nesting reconstructed from a
                        # whitespace-collapsed source -- confirm that the
                        # lmlhsstr update belongs under the lm check.
                        if lm is not None:
                            lmstr = []  # "..." "..." x0 "..."
                            lmlhsstr.append(tail)  # sync with lhsstr

                fvector = Vector(fields)
                if lm is not None:
                    fvector["lm1"] = lmscore  # hack
                edge = Hyperedge(node, tailnodes, fvector, lhsstr)
                edge.lmlhsstr = lmlhsstr

                ## new
                # rule field: "<ruleid> [<rule text>]"; the text appears only
                # the first time a rule id is seen, later edges reuse the cache
                x = rule.split()
                edge.ruleid = int(x[0])
                if len(x) > 1:
                    edge.rule = Rule.parse(" ".join(x[1:]) + " ### " + fields)
                    forest.rules[edge.ruleid] = edge.rule  #" ".join(x[1:]) #, None)
                else:
                    edge.rule = forest.rules[edge.ruleid]  # cached rule

                node.add_edge(edge)
                if is_oracle:
                    node.oracle_edge = edge

            if node.sp_terminal():
                node.word = node.edges[0].subs[0].word

        ## splitted nodes 12-3-4 => (12, 3, 4)
        tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
        forest.nodeorder = [x for (_, x) in tmp]

        forest.rehash()
        sentid += 1

        ## print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
        ##       % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

        # the last node read is the root
        forest.root = node
        node.set_root(True)

        line = file.readline()

        if line is not None and line.strip() != "":
            if line[0] == "(":
                forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=False)
                line = file.readline()
        else:
            line = None

        forest.number_nodes()
        #print forest.root.position_id

        total_time += time.time() - start_time

        if num_sents % 100 == 0:
            print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                  % (num_sents, total_time/num_sents)

        forest.subtree()  #compute the subtree string for each node

        yield forest

        if first is not None and num_sents >= first:
            break

    # better check here instead of zero-division exception
    if num_sents == 0:
        print >> logs, "NO FORESTS FOUND!!! (empty input file?)"
        sys.exit(1)
        # yield None # new: don't halt -- WHY?

    Forest.load_time = total_time
    print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
          % (num_sents, total_time, total_time/(num_sents+0.001))
#update prob for label -> left child probdict[tree.label][updatedTag] += 1 #recurse on left child getCounts(tree.subs[0], countdict, probdict) #if right child exists, recurse on right child if len(tree.subs) > 1: getCounts(tree.subs[1], countdict, probdict) return (probdict, countdict) for line in sys.stdin: tree = Tree.parse(line) getCounts(tree, countdict, probdict) #divide probdict counts by overall num tags to get probs for i in probdict: for j in probdict[i]: probdict[i][j] /= float(countdict[i]) #print results binary = 0 unary = 0 lexical = 0 print("TOP") for i in probdict: for j in probdict[i]: if j in probdict.keys():
else: tmp = ' '.join([s.label for s in t.subs]) h[t.label][tmp] += 1 for sub in t.subs: count(sub, h) #tmp.append(tree_to_str(sub)) #return "(%s %s)" % (t.label, ' '.join(tmp)) ############################ lines = list(sys.stdin) h = defaultdict(lambda: defaultdict(lambda: 0)) for i, line in enumerate(lines): t = Tree.parse(line.strip(), trunc=False) count(t, h) #t.pp() #print(print_tree(t)) #print_subs(t) #binarize(t) #t.pp() #print(tree_to_str(t)) #print_subs(t) print('TOP') for k in h: s = sum(h[k].values()) for j in h[k]:
import itertools, collections from tree import Tree if __name__ == "__main__": try: _, parsefilename, goldfilename = sys.argv except: print >> logs, "usage: evalb.py <parse-file> <gold-file>\n" sys.exit(1) matchcount = parsecount = goldcount = 0 for parseline, goldline in itertools.izip(open(parsefilename), open(goldfilename)): goldtree = Tree.parse(goldline) goldbrackets = goldtree.label_span_counts() goldcount += sum(goldbrackets.values()) if parseline.strip() == "NONE": # parsing failure continue parsetree = Tree.parse(parseline) parsebrackets = parsetree.label_span_counts() parsecount += sum(parsebrackets.values()) for bracket, count in parsebrackets.iteritems(): matchcount += min(count, goldbrackets[bracket]) print "%s\t%d brackets" % (parsefilename, parsecount) print "%s\t%d brackets" % (goldfilename, goldcount)
print features.prep_features(sys.argv[1:]) f = sys.stdin while True: line = f.readline() if line == '': break if line == '\n': continue try: k, tag = line.strip().split("\t") except: break ## can finish earlier print k, tag k = int(k) best_w = None for j in xrange(k): logprob = float(f.readline().strip()) parse = f.readline().strip() tree = Tree.parse(parse) ##print tree if j < maxk: ## print tree features.evaluate(tree, tree.get_sent(), j) ##print features.pp_fv(fvector, j) ##fvector = features.extract(tree, tree.get_sent())
for Y in unary: if score[Y][i][j] > 0: sc = 0 sc = pdict[X][Y] * score[Y][i][j] if sc > score[X][i][j]: score[X][i][j] = sc back[X][i][j] = [Y, j] changed = True ## ferr.write('\n') ## ferr.write('Dictionary Final'+'\n') ## print_dict(score,back) try: trees.append( str( debinarize( Tree.parse(backtrace(back, start, 0, len(line)).strip())))) scores.append(score[start][0][len(line)]) except: trees.append(None) scores.append(None) ferr.write(str(trees[-1]) + '\n') ferr.write(str(scores[-1])) for tree in trees: if tree is None: fout.write(str(tree) + '\n') else: fout.write(tree + '\n')
logs = sys.stderr import itertools, collections from tree import Tree if __name__ == "__main__": try: _, parsefilename, goldfilename = sys.argv except: print >> logs, "usage: evalb.py <parse-file> <gold-file>\n" sys.exit(1) matchcount = parsecount = goldcount = 0 for parseline, goldline in itertools.izip(open(parsefilename), open(goldfilename)): goldtree = Tree.parse(goldline) goldbrackets = goldtree.label_span_counts() goldcount += len(goldbrackets) if parseline.strip() == "NONE": # parsing failure continue parsetree = Tree.parse(parseline) parsebrackets = parsetree.label_span_counts() parsecount += len(parsebrackets) for bracket, count in parsebrackets.iteritems(): matchcount += min(count, goldbrackets[bracket]) print "%s\t%d brackets" % (parsefilename, parsecount) print "%s\t%d brackets" % (goldfilename, goldcount)
import sys from tree import Tree DIRFILE = sys.argv[1] TRAINTREE = sys.argv[2] TESTTXT = sys.argv[3] TESTTREE = sys.argv[4] M = int(sys.argv[5]) N = int(sys.argv[6]) print M, N with open(DIRFILE, "r") as f: with open(TESTTXT, "w") as fw: with open(TESTTREE, "w") as fwt: l = f.readlines() for line in l[-N:]: fwt.write(line) t = Tree.parse(line.strip()) fw.write(t.tosent() + '\n') with open(DIRFILE, "r") as f: with open(TRAINTREE, "w") as ftrain: l = f.readlines() for line in l[:M]: ftrain.write(line)
res += self.debinarize(x) + ' ' else: res += self.debinarize(x) return res def parse(self, sentence_origin, kbest): sentence = [] for x in sentence_origin: if len(sys.argv) > 2 and x not in self.lexicon: x = '<unk>' sentence.append(x) dp = self.cky(sentence, kbest) result = self.backtrack(0, len(sentence), 'TOP', sentence_origin, dp) return result if __name__ == "__main__": parser = Parser() kbest = 5 for line in sys.stdin: sentence_origin = line.strip().split() result = parser.parse(sentence_origin, kbest) print(result) for score, res in result: print(score) if len(res) != 0: print(parser.debinarize(Tree.parse(res))) else: print('NONE')
def extract_frames_from_parse(parse_tree_string):
    """Extract matched verb frames from a parse tree given as a string.

    The tree is split into clauses, each clause is activized and split on
    conjunctions, and every resulting tree is transformed before its verbs
    are matched against VerbFrameObjects.

    Returns a list that mixes non-empty conjunction values with tuples
    ``(best_match, tree, tag_list, sense, verb, negation)``, one tuple per
    verb for which a best match was found. An empty list is returned, after
    printing a warning, when the string cannot be parsed.
    """
    result_list = []

    # In case we're handed a bad string, bail somewhat gracefully
    try:
        parse_tree = Tree.parse(parse_tree_string)
    except ValueError:
        print "Warning: semantics could not parse tree", repr(parse_tree_string)
        return result_list

    # Split clauses to handle them separately
    split_clause_dict = frames.split_clauses(parse_tree)

    # Activize clauses
    for key, (clause, conjunction) in split_clause_dict.items():
        activized_clause = frames.activize_clause(clause)
        split_clause_dict[key] = (activized_clause, conjunction)

    for (clause, conjunction) in split_clause_dict.values():
        # Split conjunctions and duplicate arguments if necessary
        split_tree_dict = frames.split_conjunctions(clause)

        if conjunction != '':
            result_list.append(conjunction)

        for (split_tree, conjunction) in split_tree_dict.values():
            if conjunction != '':
                result_list.append(conjunction)

            for tree in split_tree:
                tag_list = []

                # Store whether there was an existential there
                if frames.is_existential(str(tree)):
                    tag_list.append('ex')

                # Transformational grammar stuff
                tree = frames.existential_there_insertion(tree)
                tree = frames.invert_clause(tree)
                tree = frames.wh_movement(tree)

                if EXTRACT_DEBUG:
                    print 'Transformed tree:'
                    print str(tree)

                verbs = frames.find_verbs(tree)

                # Create VFOs for each verb, then match them to the parse tree
                for verb, negation in verbs:
                    lemmatized_verb = morphy(verb, 'v')
                    vfo_list = frames.create_VerbFrameObjects(lemmatized_verb)

                    match_list = []

                    if EXTRACT_DEBUG:
                        print 'VFO list for %s:' % verb
                        print '\n'.join(str(vfo.frame_list) for vfo in vfo_list)

                    for vfo in vfo_list:
                        match = vfo.match_parse(tree)

                        if match:
                            if EXTRACT_DEBUG:
                                print 'Matched:'
                                print '\t', str(vfo.frame_list)
                                print 'with'
                                print '\t', str(tree)
                            match_list.append((match, vfo.classid))

                    # NOTE(review): nesting of the debug prints below was
                    # reconstructed from a whitespace-collapsed source.
                    if EXTRACT_DEBUG:
                        print 'Match list:'

                        for m in match_list:
                            print 'Sense:', m[1]
                            for a, b in m[0].items():
                                print a, str(b)
                            print '\n\n'

                    (best_match, sense) = frames.pick_best_match(match_list)

                    if EXTRACT_DEBUG:
                        print 'Chose: '
                        if best_match:
                            for a, b in best_match.items():
                                print a, str(b)
                        else:
                            print str(None)
                        print '\n\n'

                    if not best_match is None:
                        result_list.append((best_match, tree, tag_list, sense, verb, negation))

    return result_list
#!/usr/bin/python ''' Reads parse trees from a treebank (each line contains one parse tree) Converts that tree into a binary tree (input is not necessarily binary) ''' from tree import Tree import sys for line in sys.stdin: line = line.strip() t = Tree.parse(line) # convert to binary and print t.binarize() print t
def get_semantics_from_parse_tree(parse_tree_string):
    """Extract matched verb frames from a parse tree given as a string.

    The tree is split into clauses, each clause is activized and split on
    conjunctions, and every resulting tree is transformed before the verbs
    found in its string form are matched against VerbFrameObjects.

    Returns a list that mixes non-empty conjunction values with tuples
    ``(best_match, tree, tag_list, sense)``, one tuple per verb for which a
    best match was found.
    """
    parse_tree = Tree.parse(parse_tree_string)

    # Clauses are handled separately, each in its activized form.
    split_clause_dict = frames.split_clauses(parse_tree)
    for key, (clause, conjunction) in split_clause_dict.items():
        split_clause_dict[key] = (frames.activize_clause(clause), conjunction)

    result_list = []
    for (clause, conjunction) in split_clause_dict.values():
        # Conjunctions are split, duplicating arguments where needed.
        split_tree_dict = frames.split_conjunctions(clause)
        if conjunction != '':
            result_list.append(conjunction)

        for (split_tree, conjunction) in split_tree_dict.values():
            if conjunction != '':
                result_list.append(conjunction)

            for tree in split_tree:
                # Remember an existential "there" before the tree is rewritten.
                if frames.is_existential(str(tree)):
                    tag_list = ['ex']
                else:
                    tag_list = []

                # Transformational grammar steps
                tree = frames.existential_there_insertion(tree)
                tree = frames.invert_clause(tree)
                tree = frames.wh_movement(tree)

                # Verbs are the words directly following a VB* tag.
                verb_finder = re.compile(r'(?<=VB[ DGNPZ]) *\w*(?=\))')
                for raw_verb in verb_finder.findall(str(tree)):
                    verb = raw_verb.strip().lower()
                    # The lemma is what the frame lookup is keyed on.
                    lemmatized_verb = morphy(verb, 'v')
                    vfo_list = frames.create_VerbFrameObjects(lemmatized_verb)

                    match_list = []
                    for vfo in vfo_list:
                        match = vfo.match_parse(tree)
                        if match:
                            match_list.append((match, vfo.classid))

                    best_match, sense = frames.pick_best_match(match_list)
                    if best_match is not None:
                        result_list.append((best_match, tree, tag_list, sense))

    return result_list
def load(filename, lower=True, sentid=0):
    '''Generate Forest objects read from ``filename``.

    This is a generator; use load().next() for a singleton.

    Layout of one forest, as parsed below:
      * a header line ``tag<TAB>sentence`` (blank lines before it are
        skipped);
      * a line holding the number of nodes;
      * per node, ``iden<TAB>labelspan<TAB>size[ ||| features]`` followed by
        ``size`` hyperedge lines ``tails ||| features``. A leading ``*`` on
        the tails marks the oracle edge; a trailing ``~`` on the features
        means they are added to those of the cached identical edge;
      * optionally a gold tree line starting with ``(``.

    Reading stops at the first line that cannot be split into a header.
    ``lower`` lower-cases the sentence words (the cased copy is also handed
    to the Forest). When the generator is exhausted, ``Forest.load_time`` is
    set to the accumulated loading time and a summary is written to ``logs``.
    '''
    file = getfile(filename)
    line = None
    total_time = 0
    num_sents = 0

    while True:
        start_time = time.time()
        ##'\tThe complicated language in ...\n"
        ## tag is often missing
        try:
            if line is None or line == "\n":
                line = "\n"
                while line == "\n":
                    line = file.readline()  # emulate seek
            tag, sent = line.split("\t")
        except:
            ## no more forests
            break

        num_sents += 1

        sent = sent.split()
        cased_sent = sent[:]
        if lower:
            sent = [w.lower() for w in sent]  # mark johnson: lowercase all words
        num = int(file.readline())

        forest = Forest(num, sent, cased_sent, tag)
        forest.labelspans = {}
        forest.short_edges = {}

        for i in xrange(1, num+1):
            ## '2\tDT* [0-1]\t1 ||| 1232=2 ...\n'
            ## node-based features here: wordedges, greedyheavy, word(1), [word(2)], ...
            line = file.readline()
            try:
                keys, fields = line.split(" ||| ")
            except:
                # no node-level features on this line
                keys = line
                fields = ""

            iden, labelspan, size = keys.split("\t")  ## iden can be non-ints
            size = int(size)

            fvector = FVector.parse(fields)
            node = Node(iden, labelspan, size, fvector, sent)
            forest.add_node(node)

            if cache_same:
                # nodes with the same labelspan share one feature vector
                if labelspan in forest.labelspans:
                    node.same = forest.labelspans[labelspan]
                    node.fvector = node.same.fvector
                else:
                    forest.labelspans[labelspan] = node

            for j in xrange(size):
                is_oracle = False

                ## '\t1 ||| 0=8.86276 1=2 3\n'
                tails, fields = file.readline().strip().split(" ||| ")

                if tails[0] == "*":  # oracle edge
                    is_oracle = True
                    tails = tails[1:]

                tails = tails.split()  ## could be non-integers
                tailnodes = []

                for x in tails:
                    assert x in forest.nodes, "BAD TOPOL ORDER: node #%s is referred to " % x + \
                           "(in a hyperedge of node #%s) before being defined" % iden
                    ## topological ordering
                    tail = forest.nodes[x]
                    tailnodes.append(tail)

                use_same = False
                if fields[-1] == "~":
                    use_same = True
                    fields = fields[:-1]

                fvector = FVector.parse(fields)
                edge = Hyperedge(node, tailnodes, fvector)

                if cache_same:
                    short_edge = edge.shorter()
                    if short_edge in forest.short_edges:
                        edge.same = forest.short_edges[short_edge]
                        if use_same:
                            edge.fvector += edge.same.fvector
                    else:
                        forest.short_edges[short_edge] = edge

                node.add_edge(edge)
                if is_oracle:
                    node.oracle_edge = edge

            if node.sp_terminal():
                node.word = node.edges[0].subs[0].word

        ## splitted nodes 12-3-4 => (12, 3, 4)
        tmp = sorted([(map(int, x.iden.split("-")), x) for x in forest.nodeorder])
        forest.nodeorder = [x for (_, x) in tmp]

        forest.rehash()
        sentid += 1

        ## print >> logs, "sent #%d %s, %d words, %d nodes, %d edges, loaded in %.2lf secs" \
        ##       % (sentid, forest.tag, forest.len, num, forest.num_edges, time.time() - basetime)

        # the last node read is the root
        forest.root = node
        node.set_root(True)

        line = file.readline()

        if line is not None and line.strip() != "":
            if line[0] == "(":
                forest.goldtree = Tree.parse(line.strip(), trunc=True, lower=True)
                line = file.readline()
        else:
            line = None

        total_time += time.time() - start_time

        if num_sents % 100 == 0:
            print >> logs, "... %d sents loaded (%.2lf secs per sent) ..." \
                  % (num_sents, total_time/num_sents)

        yield forest

    Forest.load_time = total_time
    # An input without any forest leaves num_sents at 0; guard the average
    # so exhausting the generator does not raise ZeroDivisionError.
    avg_time = total_time / num_sents if num_sents else 0.0
    print >> logs, "%d forests loaded in %.2lf secs (avg %.2lf per sent)" \
          % (num_sents, total_time, avg_time)
from collections import defaultdict from tree import Tree from pprint import pprint import sys fin = sys.stdin fout = sys.stdout ferr = sys.stderr trees = [line.strip() for line in fin.readlines()] for i, tree in enumerate(trees): trees[i] = Tree.parse(tree.strip()) gcounts = defaultdict(lambda: defaultdict(lambda: float(0))) def getcfgcounts(tree, gcounts): if tree.word == None: for t in tree.subs: getcfgcounts(t, gcounts) if len(tree.subs) == 2: children = '' for t in tree.subs: children += ' ' + t.label children = children.strip() gcounts[tree.label][children] += 1 elif len(tree.subs) == 1: gcounts[tree.label][tree.subs[0].label] += 1 else: print('something odd here! tree: {} gcounts:{}'.format(