def add(self, r, estcost=0.):
    """File rule r (with estimated cost estcost) into this rule index.

    A rule whose French side is exactly one nonterminal is treated as a
    unary rule and kept in a dedicated table keyed by its (index-cleared)
    RHS symbol; every other rule is inserted into a trie keyed by the
    symbols of its French side, ending at a RuleBin that holds and prunes
    the competing rules.
    """
    if r.f.arity() == 1 and len(r.f) == 1:
        # Unary special case: record it and remember the ordering
        # constraint (child-label, parent-label).
        log.write("unary rule: %s\n" % r)
        child = sym.clearindex(r.f[0])
        self.unary_rules.setdefault(child, RuleBin(self.threshold, self.limit)).add(estcost, r)
        self.unary_less_than.add((child, r.lhs))
    else:
        # Each trie node is a pair [RuleBin-or-None, children-dict].
        node = self.root
        for symbol in r.f:
            # Variables are matched with their index stripped.
            if sym.isvar(symbol):
                symbol = sym.clearindex(symbol)
            node[1].setdefault(symbol, [None, {}])
            node = node[1][symbol]
        if node[0] is None:
            # First rule reaching this node: create its bin.
            node[0] = RuleBin(self.threshold, self.limit)
            self.rulebin_count += 1
        rulebin = node[0]
        rulebin.add(estcost, r)
        rulebin.prune()
    self.count += 1
def parse(n, xrules, rules):
    """Build forest items over a sentence of length n, bottom-up (CKY-style).

    n = length of sentence
    xrules = rules with position info, to be assembled into forest
    rules = grammar of rules from all sentences
    Returns a list of item lists, one per maximal non-overlapping span,
    each list in bottom-up (topological) order.
    N.B. This does not work properly without tight_phrases"""
    # chart[i][k][v] is the forest Item deriving nonterminal v over span (i, k),
    # or None if that (span, label) pair was never derived.
    chart = [[dict((v, None) for v in nonterminals) for j in xrange(n + 1)]
             for i in xrange(n + 1)]
    # Bottom-up over span lengths so antecedent items already exist.
    for l in xrange(1, n + 1):
        for i in xrange(n - l + 1):
            k = i + l
            for x in nonterminals:
                if x != START:
                    item = forest.Item(x, i, k)
                    for r in xrules.get((x, i, k), ()):
                        # Collect antecedent items: a tuple in r.fpos marks a
                        # variable position with its sub-span.
                        ants = []
                        for fi in xrange(len(r.f)):
                            if type(r.fpos[fi]) is tuple:
                                (subi, subj) = r.fpos[fi]
                                ants.append(chart[subi][subj][sym.clearindex(r.f[fi])])
                        # Only derive if every antecedent span was parseable.
                        if None not in ants:
                            # the reason for the lookup in rules is to allow
                            # duplicate rules to be freed
                            item.derive(ants, rules[r], r.scores[0])
                    if len(item.deds) == 0:
                        item = None
                    if item is not None:
                        chart[i][k][x] = item
                else:  # x == START: glue derivations only
                    item = forest.Item(x, i, k)
                    # S -> X (only anchored at the sentence start)
                    if i == 0:
                        for y in nonterminals:
                            if y != START and chart[i][k][y] is not None:
                                item.derive([chart[i][k][y]], gluestop[y])
                    # S -> S X
                    for j in xrange(i, k + 1):
                        for y in nonterminals:
                            if chart[i][j][START] is not None and chart[j][k][y] is not None:
                                item.derive([chart[i][j][START], chart[j][k][y]], glue[y])
                    if len(item.deds) > 0:
                        chart[i][k][x] = item
                        # Spread a fractional count of 1 over this item's
                        # derivations.  NOTE(review): this mutates the shared
                        # rule objects in place, so counts accumulate across
                        # items/sentences — presumably intentional; confirm.
                        for ded in item.deds:
                            ded.rule.scores = [ded.rule.scores[0] + 1.
                                               / len(item.deds)]
    covered = [False] * n
    spans = []
    # find biggest nonoverlapping spans: longest spans first, greedily keeping
    # any parseable span whose words are not yet claimed by a longer one.
    for l in xrange(n, 0, -1):
        for i in xrange(n - l + 1):
            k = i + l
            flag = False
            for v in reversed(nonterminals):
                if chart[i][k][v] is not None:
                    flag = True
            if flag:
                for j in xrange(i, k):
                    # don't let any of the spans overlap
                    if covered[j]:
                        flag = False
            if flag:
                for j in xrange(i, k):
                    covered[j] = True
                spans.append((i, k))
    # old buggy version
    #spans = [(0,n)]
    #sys.stderr.write("%s\n" % spans)
    # put in topological order: within each kept span, emit items smallest
    # sub-span first so every item precedes its consumers.
    itemlists = []
    for (start, stop) in spans:
        items = []
        for l in xrange(1, stop - start + 1):
            for i in xrange(start, stop - l + 1):
                k = i + l
                for v in nonterminals:
                    if chart[i][k][v] is not None:
                        items.append(chart[i][k][v])
        if len(items) > 0:
            itemlists.append(items)
    return itemlists
            # NOTE(review): fragment — the enclosing definition (a rule
            # extraction routine building an XRule from aligned phrase data)
            # begins before this chunk; indentation below is reconstructed.
            ewords[j] = None
        index += 1

    # Require an aligned French word
    if opts.require_aligned_terminal and not flag:
        return None

    # Compact the English side: drop blanked-out positions, separating
    # nonterminal placeholders (tuples) from plain words, and record each
    # surviving element's position(s) in epos.
    epos = []
    new_ewords = []
    for i in xrange(elen):
        if ewords[i] is not None:
            if type(ewords[i]) is tuple:
                # presumably (symbol, english start, english end) — confirm
                (v, ei, ej) = ewords[i]
                # force slash categories to be at left edge of English side
                if force_english_prefix and len(new_ewords) != 0 and sym.clearindex(v) in prefix_labels:
                    return None
                new_ewords.append(v)
                epos.append((ei, ej))
            else:
                # plain word: absolute position is offset by the span start j1
                new_ewords.append(ewords[i])
                epos.append(i + j1)

    # Assemble the extracted rule and attach position metadata.
    r = XRule(x, rule.Phrase(tuple(fwords)), rule.Phrase(tuple(new_ewords)))
    r.fpos = fpos
    r.epos = epos
    r.span = (i1, i2, j1, j2)
    if opts.keep_word_alignments:
        r.word_alignments = []
        for fi in xrange(len(fpos)):
            # NOTE(review): loop body continues beyond this chunk
def parse(n, xrules, rules):
    """Assemble a parse forest over a sentence of length n, bottom-up.

    n = length of sentence
    xrules = rules with position info, to be assembled into forest
    rules = grammar of rules from all sentences
    Returns a list of item lists (one per maximal non-overlapping span),
    each ordered bottom-up.
    N.B. This does not work properly without tight_phrases"""
    # chart[left][right][v]: forest Item for nonterminal v over (left, right).
    chart = [[dict((v, None) for v in nonterminals) for _col in xrange(n + 1)]
             for _row in xrange(n + 1)]
    for span in xrange(1, n + 1):
        for left in xrange(n - span + 1):
            right = left + span
            for lhs in nonterminals:
                if lhs == START:
                    # Start symbol: glue derivations only.
                    item = forest.Item(lhs, left, right)
                    # S -> X (anchored at the sentence start)
                    if left == 0:
                        for rhs in nonterminals:
                            if rhs != START and chart[left][right][rhs] is not None:
                                item.derive([chart[left][right][rhs]], gluestop[rhs])
                    # S -> S X
                    for mid in xrange(left, right + 1):
                        for rhs in nonterminals:
                            if chart[left][mid][START] is not None and chart[mid][right][rhs] is not None:
                                item.derive([chart[left][mid][START], chart[mid][right][rhs]], glue[rhs])
                    if len(item.deds) > 0:
                        chart[left][right][lhs] = item
                        # Spread a fractional count of 1 across the
                        # derivations (mutates the shared rule objects).
                        for ded in item.deds:
                            ded.rule.scores = [ded.rule.scores[0] + 1. / len(item.deds)]
                else:
                    item = forest.Item(lhs, left, right)
                    for r in xrules.get((lhs, left, right), ()):
                        # Antecedents: tuple entries of r.fpos mark variables.
                        ants = []
                        for fi in xrange(len(r.f)):
                            if type(r.fpos[fi]) is tuple:
                                subi, subj = r.fpos[fi]
                                ants.append(chart[subi][subj][sym.clearindex(r.f[fi])])
                        if None not in ants:
                            # the reason for the lookup in rules is to allow
                            # duplicate rules to be freed
                            item.derive(ants, rules[r], r.scores[0])
                    if len(item.deds) == 0:
                        item = None
                    if item is not None:
                        chart[left][right][lhs] = item
    covered = [False] * n
    spans = []
    # find biggest nonoverlapping spans: longest first, skipping any span
    # that overlaps one already kept
    for span in xrange(n, 0, -1):
        for left in xrange(n - span + 1):
            right = left + span
            parseable = any(chart[left][right][v] is not None
                            for v in reversed(nonterminals))
            if parseable and not any(covered[j] for j in xrange(left, right)):
                for j in xrange(left, right):
                    covered[j] = True
                spans.append((left, right))
    # put in topological order: within each span, shorter sub-spans first
    itemlists = []
    for (start, stop) in spans:
        items = []
        for span in xrange(1, stop - start + 1):
            for left in xrange(start, stop - span + 1):
                right = left + span
                for v in nonterminals:
                    if chart[left][right][v] is not None:
                        items.append(chart[left][right][v])
        if len(items) > 0:
            itemlists.append(items)
    return itemlists
        # NOTE(review): fragment — the enclosing definition begins before and
        # ends after this chunk; indentation below is reconstructed.
        # Blank out the English words covered by the sub-span, keeping
        # position sub_j1 itself (presumably already replaced by a
        # nonterminal placeholder — confirm against the preceding code).
        for j in xrange(sub_j1+1,sub_j2):
            ewords[j] = None
        index += 1

    # Require an aligned French word
    if opts.require_aligned_terminal and not flag:
        return None

    # Compact the English side: drop blanked positions, recording each
    # surviving element's position(s) in epos.
    epos = []
    new_ewords = []
    for i in xrange(elen):
        if ewords[i] is not None:
            if type(ewords[i]) is tuple:
                # presumably (symbol, english start, english end) — confirm
                (v, ei, ej) = ewords[i]
                # force slash categories to be at left edge of English side
                if force_english_prefix and len(new_ewords) != 0 and sym.clearindex(v) in prefix_labels:
                    return None
                new_ewords.append(v)
                epos.append((ei,ej))
            else:
                # plain word: absolute position is offset by span start j1
                new_ewords.append(ewords[i])
                epos.append(i+j1)

    # Assemble the extracted rule and attach position metadata.
    r = XRule(x,rule.Phrase(tuple(fwords)), rule.Phrase(tuple(new_ewords)))
    r.fpos = fpos
    r.epos = epos
    r.span = (i1,i2,j1,j2)
    if opts.keep_word_alignments:
        r.word_alignments = []
        # NOTE(review): alignment-filling loop continues beyond this chunk