def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    elen = len(a.ewords)
    chart = [[None for ej in xrange(elen + 1)] for ei in xrange(elen + 1)]
    for ((ei, ej), labels) in a.espans.iteritems():
        chart[ei][ej] = [labels[0]] # take the highest label
    for el in xrange(2, maxabslen + 1):
        for ei in xrange(elen - el + 1):
            ej = ei + el
            if chart[ei][ej] is not None: # must be a singleton
                continue
            bestsplit = None
            bestlen = None
            for ek in xrange(ei + 1, ej):
                if chart[ei][ek] is not None and chart[ek][ej] is not None and (
                        bestlen is None or len(chart[ei][ek]) + len(chart[ek][ej]) < bestlen):
                    bestsplit = ek
                    bestlen = len(chart[ei][ek]) + len(chart[ek][ej])
            if bestlen is not None and bestlen <= consts:
                chart[ei][ej] = chart[ei][bestsplit] + chart[bestsplit][ej]
    for (ei, ej) in ephrase_index:
        if not a.espans.has_key((ei, ej)) and chart[ei][ej] is not None:
            a.espans[ei, ej] = [sym.fromtag("_".join(sym.totag(x) for x in chart[ei][ej]))]
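# A minimal standalone sketch (hypothetical names, plain strings instead of the
# sym.* symbols used above, not part of the original module) of the dynamic
# program in add_multiconstituents: cover an unlabeled span with the fewest
# adjacent labeled spans, which is how joined labels such as "NP_VP" arise for
# phrases that cross constituent boundaries.
def _multiconstituent_cover(spans, n, max_parts):
    chart = [[None] * (n + 1) for _ in range(n + 1)]
    for (i, j), label in spans.items():
        chart[i][j] = [label]
    for length in range(2, n + 1):
        for i in range(n - length + 1):
            j = i + length
            if chart[i][j] is not None:
                continue
            best = None
            for k in range(i + 1, j):
                if chart[i][k] is not None and chart[k][j] is not None:
                    cand = chart[i][k] + chart[k][j]
                    if best is None or len(cand) < len(best):
                        best = cand
            if best is not None and len(best) <= max_parts:
                chart[i][j] = best
    return chart

# e.g. with constituents NP = (0,2) and VP = (2,5), the span (0,5) is covered
# by two pieces and would get the joined label "NP_VP":
#   chart = _multiconstituent_cover({(0, 2): "NP", (2, 5): "VP"}, 5, 2)
#   "_".join(chart[0][5])   # -> "NP_VP"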
def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label"""
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x)) for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort() # stable
    for (ei, ej) in ephrase_index:
        # note: the leading "True or" short-circuits this test, so every phrase
        # is considered even if it already carries a label
        if True or not (a.espans.has_key((ei, ej)) and len(a.espans[ei, ej]) > 0):
            for (ej1, x) in ei_index.get(ei, []):
                if ej1 > ej:
                    x1 = sym.fromtag(sym.totag(x) + "*")
                    a.espans.setdefault((ei, ej), []).append(x1)
                    prefix_labels.add(x1)
                    break
    if log.level >= 3:
        log.write(str([(i, j, sym.tostring(x)) for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
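# Illustration (hypothetical data): with a.espans = {(0, 5): [NP]} and an
# extracted phrase spanning (0, 3), the loop above finds a constituent that
# starts at the same left index and ends further right, so (0, 3) receives the
# fake prefix label "NP*" and that label is recorded in prefix_labels.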
def add_sister_prefixes_helper(a, ephrases, enode, i):
    """if a phrase comprises one or more (but not all) leftmost children of a
    constituent, then add it and give it a fake label"""
    j = i + enode.length
    if log.level >= 3:
        log.write("(i,j) = %s\n" % ((i, j),))
    x = enode.label
    j1 = i
    for ci in xrange(len(enode.children)):
        child = enode.children[ci]
        j1 += child.length
        if log.level >= 3:
            log.write("(i,j1) = %s\n" % ((i, j1),))
        if j1 < j and (i, j1) in ephrases:
            # constprefix3:
            #x1 = sym.fromtag("%s*" % x)
            # subcat-lr2:
            #subcat = [sister.label for sister in enode.children[ci+1:] if sister.required]
            #x1 = sym.fromtag("/".join(["%s*"%x]+subcat))
            # markov1:
            x1 = sym.fromtag("%s/%s" % (x, enode.children[ci + 1].label))
            # markov2:
            #x1 = sym.fromtag("%s(%s)" % (x, enode.children[ci].label))
            a.espans.setdefault((i, j1), []).append(x1)
            prefix_labels.add(x1)
    for child in enode.children:
        add_sister_prefixes_helper(a, ephrases, child, i)
        i += child.length
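# A standalone sketch (hypothetical names, plain strings instead of sym.*) of
# the "markov1" variant left active above: a phrase covering the leftmost
# child(ren) of a constituent is labeled PARENT/NEXT-SISTER, e.g. the NP of
# S -> NP VP yields the fake label "S/VP".
class _Node(object):
    def __init__(self, label, children=None, length=1):
        self.label = label
        self.children = children or []
        self.length = sum(c.length for c in self.children) if self.children else length

def _sister_prefix_labels(node, i, phrases, out):
    j1 = i
    for ci, child in enumerate(node.children):
        j1 += child.length
        if j1 < i + node.length and (i, j1) in phrases:
            out.append(((i, j1), "%s/%s" % (node.label, node.children[ci + 1].label)))
    k = i
    for child in node.children:
        _sister_prefix_labels(child, k, phrases, out)
        k += child.length

# toy = _Node("S", [_Node("NP", [_Node("DT"), _Node("NN")]), _Node("VP", [_Node("VB")])])
# labels = []
# _sister_prefix_labels(toy, 0, set([(0, 2)]), labels)
# labels == [((0, 2), "S/VP")]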
def make_forest(fieldss):
    nodes = {}
    goal_ids = set()
    for fields in fieldss:
        node_id = fields['hyp']
        if node_id not in nodes:
            nodes[node_id] = forest.Item(sym.fromtag('PHRASE'), 0, 0, [])
        node = nodes[node_id]
        if node_id == 0:
            r = rule.Rule(sym.fromtag('PHRASE'), rule.Phrase([]), rule.Phrase([]))
            node.deds.append(forest.Deduction((), r, svector.Vector()))
        else:
            m = scores_re.match(fields['scores'])
            core_values = [float(x) for x in m.group(1).split(',')]
            dcost = svector.Vector(m.group(2).encode('utf8'))
            for i, x in enumerate(core_values):
                dcost["_core%d" % i] = x
            back = int(fields['back'])
            ant = nodes[back]
            f = fields['src-phrase'].encode('utf8').split()
            e = fields['tgt-phrase'].encode('utf8').split()
            if len(f) != int(fields['cover-end']) - int(fields['cover-start']) + 1:
                sys.stderr.write("warning: French phrase length didn't match covered length\n")
            f = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + f)
            e = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + e)
            r = rule.Rule(sym.fromtag('PHRASE'), f, e)
            ded = forest.Deduction((ant,), r, dcost)
            node.deds.append(ded)
        if int(fields['forward']) < 0: # goal
            goal_ids.add(node_id)
    goal = forest.Item(None, 0, 0, [])
    for node_id in goal_ids:
        goal.deds.append(forest.Deduction((nodes[node_id],), None, svector.Vector()))
    return goal
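# Note: make_forest assumes each element of fieldss is a dict with at least the
# keys 'hyp', 'back', 'scores', 'src-phrase', 'tgt-phrase', 'cover-start',
# 'cover-end' and 'forward' (the fields of what appears to be a Moses-style
# search-graph entry, already split upstream); the 'scores' string is parsed by
# the scores_re pattern defined elsewhere.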
def _str_helper(self, item, accum):
    ded = self.ded[id(item)]
    if ded.rule:
        x = ded.rule.lhs
    else:
        x = sym.fromtag("-")
    if len(ded.ants) > 0:
        accum.extend(["(", sym.totag(x)])
        for ant in ded.ants:
            accum.append(" ")
            self._str_helper(ant, accum)
        accum.append(")")
    else:
        accum.append(sym.totag(x))
def add_bounded_prefixes_helper(a, phrases, node, i, stack):
    j = i + node.length
    if node.label in ['NP']:
        stack = stack + [(node.label, i, j)]
    else:
        stack = [(node.label, i, j)]
    i1 = i
    for child in node.children:
        if i1 > i:
            for (x, i0, j0) in stack:
                if (i0, i1) in phrases:
                    x1 = sym.fromtag("%s*" % x)
                    a.espans.setdefault((i0, i1), []).append(x1)
                    prefix_labels.add(x1)
        add_bounded_prefixes_helper(a, phrases, child, i1, stack)
        i1 += child.length
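# Note: only 'NP' nodes extend the stack (any other label resets it), so the
# stack seen at a node holds that node plus any ancestors reachable through an
# unbroken chain of NPs.  A phrase that starts at the left edge of one of those
# constituents and ends at a child boundary of the current node is given the
# fake label "<label>*".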
def input(self, input):
    self.rules = collections.defaultdict(list)
    for tag, attrs, i, j in input.fmeta:
        attrs = sgml.attrs_to_dict(attrs)
        if attrs.has_key('english'):
            ephrases = attrs['english'].split('|')
            if attrs.has_key('cost'):
                costs = [float(x) for x in attrs['cost'].split('|')]
            elif attrs.has_key('prob'):
                costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
            else:
                costs = [-math.log10(1.0 / len(ephrases)) for e in ephrases] # uniform
            if len(costs) != len(ephrases):
                sys.stderr.write("wrong number of probabilities/costs\n")
                raise ValueError
            if attrs.has_key('features'):
                features = attrs['features'].split('|')
                if len(features) != len(ephrases):
                    sys.stderr.write("wrong number of feature names\n")
                    raise ValueError
            elif attrs.has_key('feature'):
                features = [attrs['feature'] for ephrase in ephrases]
            else:
                features = ['sgml' for ephrase in ephrases]
            if attrs.has_key('label'):
                tags = attrs['label'].split('|')
            else:
                tags = [tag.upper()]
            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort
            for (ephrase, cost, feature) in zip(ephrases, costs, features):
                for tag in tags:
                    r = rule.Rule(sym.fromtag(tag),
                                  rule.Phrase(input.fwords[i:j]),
                                  rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                  scores=svector.Vector('%s' % feature, cost))
                    self.rules[i, j].append((r,))
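# Illustration (hypothetical markup, not from the original source): a region
# annotated as
#   <seg english="hello|hi there" prob="0.7|0.3" label="GREETING">
# would yield two rules labeled GREETING, one per alternative translation, with
# costs -log10(0.7) and -log10(0.3) charged to the default 'sgml' feature
# (unless a 'feature' or 'features' attribute names a different one).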
def input(self, lat):
    self.rules = collections.defaultdict(list)
    for span in lat.spans:
        i, j = span.i, span.j
        if hasattr(span, 'v'):
            v = svector.Vector(span.v)
        else:
            v = model.zero
        # bug: if new nonterminals are introduced at this point,
        # they will not participate in the topological sort
        r = rule.Rule(sym.fromtag(span.x),
                      rule.Phrase([sym.fromstring(f) for f in span.f]),
                      rule.Phrase([sym.fromstring(e) for e in span.e]),
                      scores=v)
        self.rules[i, j].append((r,))
        if log.level >= 2:
            log.write("added lattice rule at (%d,%d): %s\n" % (i, j, r))
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Currently this assumes that the only frontier nodes in the tree are words."""
    while True:
        try:
            tok = tokiter.next()
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo, delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
            else:
                ruleid = sym.fromtag(ruleid)
            dcost = svector.Vector()
            if dcoststr:
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    v = float(v)
                    dcost[f] = v
            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))
            node = Deduction(ants=ants, rule=r, dcost=dcost)
            if want_item: # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
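# Note: this generator consumes the token tuples produced by forest_lexer --
# 'or' (with a node id), 'nonterm' (node id, rule id, dcost string), 'term'
# (a word), 'ref' (a node id to share) and 'pop' -- and rebuilds Items and
# Deductions, memoizing nodes by id so that 'ref' tokens can reuse subforests.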
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Currently this assumes that the only frontier nodes in the tree are words."""
    while True:
        try:
            tok = tokiter.next()
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo,
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
                node.nodeid = nodeid
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
                xrs_ruleid = None # no explicit rule id
            else:
                # lhuang: N.B.: sym.fromtag would re-alloc it
                xrs_ruleid = int(ruleid)
                ruleid = sym.fromtag(ruleid) #int(ruleid)
            dcost = svector.Vector()
            if dcoststr:
                # lhuang: features are read from forest, not rules
                # so there is no "e^..." or "10^..."
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    v = float(v)
                    dcost[f] = v
            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))
            node = Deduction(ants=ants, rule=r, dcost=dcost)
            node.ruleid = xrs_ruleid
            if want_item: # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
    # fragment: this code runs inside the loop over input sentences
    # (note the 'continue' statements below)
    a.espans = None
    if opts.trees:
        if ebfile is not None:
            etree = tree.str_to_tree(ebfile.readline())
            if etree is None:
                sys.stderr.write("warning, line %d: null tree\n" % a.lineno)
                a.espans = {}
            elif etree.length != len(a.ewords):
                sys.stderr.write("warning, line %d: length mismatch between English words and trees (%d != %d)\n" % (a.lineno, len(a.ewords), etree.length))
                sys.stderr.write("  start of English sentence: %s\n" % " ".join([sym.tostring(x) for x in a.ewords[:5]]))
                a.espans = {}
            else:
                remove_req(etree)
                a.espans = etree.spans()
                for (span, labels) in a.espans.iteritems():
                    a.espans[span] = [sym.fromtag(x) for x in labels]

    # done reading all input lines

    if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen:
        continue
    realcount += 1
    if opts.parallel is not None:
        if realcount % opts.parallel[1] != opts.parallel[0] % opts.parallel[1]:
            continue

    for feature in features:
        feature.process_alignment(a)

    phrases = extract_phrases(a, maxabslen)

    if opts.loosen:
            result.append(' var ')
            _item_to_text(node.ants[sym.getindex(child) - 1], result, memo, mode=mode, weights=weights)
        else:
            # lhuang: english word
            result.append(' word ')
            w = quoteattr(sym.tostring(child))
            result.append(w)
            print w,
    result.append(')')
    print # end of a hyperedge

class TreeFormatException(Exception):
    pass

dummylabel = sym.fromtag("-")
dummyi = dummyj = None

whitespace = re.compile(r"\s+")
openbracket = re.compile(r"""(?:#(\d+))?\((\S+)""")
noderefre = re.compile(r"#([^)\s]+)")
labelre = re.compile(r"^(-?\d*)(?:<(\S+)>)?$")

def forest_lexer(s):
    si = 0
    while si < len(s):
        m = whitespace.match(s, si)
        if m:
            si = m.end()
            continue
    for child in children:
        if isinstance(child, Item):
            result.append(' ')
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            result.append(' ')
            _item_to_text(node.ants[sym.getindex(child) - 1], result, memo, mode=mode, weights=weights)
        else:
            result.append(' ')
            result.append(quoteattr(sym.tostring(child)))
    result.append(')')

class TreeFormatException(Exception):
    pass

dummylabel = sym.fromtag("X")
dummyi = dummyj = None

whitespace = re.compile(r"\s+")
openbracket = re.compile(r"""(?:#(\d+))?\((\S+)""")
noderefre = re.compile(r"#([^)\s]+)")
labelre = re.compile(r"^(-?\d*)(?:<(\S+)>)?$")

def forest_lexer(s):
    si = 0
    while si < len(s):
        m = whitespace.match(s, si)
        if m:
            si = m.end()
            continue