def get_cfg(self, goal, root):
    """
    Constructs the CFG by visiting complete items in a top-down fashion.
    This is effectively a reachability test, and it serves the purpose of
    filtering out nonterminal symbols that can never be reached from the root.
    Note that bottom-up intersection typically enumerates a lot of useless
    (unreachable) items.
    This is the recursive procedure described in the paper (Nederhof and Satta, 2008).
    """
    G = WCFG()
    processed = set()
    fsa = self._wfsa
    itergenerating = self._agenda.itergenerating
    itercomplete = self._agenda.itercomplete

    def make_rules(lhs, start, end):
        # guard against revisiting the same (symbol, start, end) signature
        if (lhs, start, end) in processed:
            return
        processed.add((lhs, start, end))
        for item in itercomplete(lhs, start, end):
            G.add(self.get_intersected_rule(item))
            # the FSA states crossed by this item: its inner states plus the dot
            fsa_states = item.inner + (item.dot,)
            for i, sym in itertools.ifilter(lambda (_, s): is_nonterminal(s),
                                            enumerate(item.rule.rhs)):
                if (sym, fsa_states[i], fsa_states[i + 1]) not in processed:
                    # Nederhof does not perform this test,
                    # but in python it turned out to be crucial
                    make_rules(sym, fsa_states[i], fsa_states[i + 1])
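# The same top-down reachability idea in isolation: a minimal, hypothetical
# sketch that keeps only rules reachable from the root. It assumes a plain
# dict mapping each nonterminal to a list of right-hand sides; none of these
# names come from this module.
def reachable_rules(rules_by_lhs, root):
    kept = []
    seen = set()

    def visit(sym):
        if sym in seen:
            return
        seen.add(sym)
        for rhs in rules_by_lhs.get(sym, ()):
            kept.append((sym, rhs))
            for s in rhs:
                if s in rules_by_lhs:  # only nonterminals have rules
                    visit(s)

    visit(root)
    return kept

# For example, with rules_by_lhs = {'S': [('X', 'X')], 'X': [('a',)],
# 'Y': [('b',)]}, reachable_rules(rules_by_lhs, 'S') drops the 'Y' rule.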
def make_grammar(fsa):
    """
    Builds a minimal binary-branching grammar: [S] covers a single [X], and
    [X] either splits in two or rewrites to any symbol of the FSA.
    Weights are log-probabilities, so 0.0 means probability 1.
    """
    cfg = WCFG()
    cfg.add(Rule('[S]', ['[X]'], 0.0))
    cfg.add(Rule('[X]', ['[X]', '[X]'], 0.0))
    for word in fsa.itersymbols():
        cfg.add(Rule('[X]', [word], 0.0))
    return cfg
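# Hedged usage sketch: intersecting the grammar above with a sentence FSA.
# make_linear_fsa and Earley appear elsewhere in this codebase; the example
# sentence and the '[GOAL]' symbol follow the conventions used in main().
wfsa = make_linear_fsa('a b c')
cfg = make_grammar(wfsa)
parser = Earley(cfg, wfsa)
forest = parser.do('[S]', '[GOAL]')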
def main(args):
    wcfg = WCFG(read_grammar_rules(args.grammar))
    for input_str in args.input:
        wfsa = make_linear_fsa(input_str)
        parser = Earley(wcfg, wfsa)
        forest = parser.do('[S]', '[GOAL]')
        if not forest:
            print 'NO PARSE FOUND'
            continue
        # add the reversed version of every rule whose right-hand side
        # consists of multiple nonterminals
        new_rules = []
        for rule in forest:
            if len(rule.rhs) > 1 and all(map(is_nonterminal, rule.rhs)):
                new_rules.append(Rule(rule.lhs, list(reversed(rule.rhs)), rule.log_prob))
        for rule in new_rules:
            forest.add(rule)
        print '# FOREST'
        print forest
        print
        if args.show_permutations:
            print '# PERMUTATIONS'
            counts = count_derivations(forest, '[GOAL]')
            total = 0
            for p, n in sorted(counts['p'].iteritems(), key=lambda (k, v): k):
                print 'permutation=(%s) derivations=%d' % (' '.join(str(i) for i in p), n)
                total += n
            print 'permutations=%d derivations=%d' % (len(counts['p']), total)
            print
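# A hedged sketch of the command-line wiring that main() appears to expect
# (args.grammar, args.input, args.show_permutations); the real option names
# and types in this repo may differ.
import argparse
import sys

def get_argparser():
    parser = argparse.ArgumentParser(description='intersect a grammar with input sentences')
    parser.add_argument('grammar', type=argparse.FileType('r'),
                        help='file with one grammar rule per line')
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help='one sentence per line')
    parser.add_argument('--show-permutations', action='store_true',
                        help='also print permutations with derivation counts')
    return parser

if __name__ == '__main__':
    main(get_argparser().parse_args())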
def initialise(wcfg, wfsa, root, goal, intersection):
    """
    Calculates a first derivation based on a simpler (thus smaller and faster)
    version of the grammar, thereby determining the initial conditions.
    Only applicable to the 'milos' grammar format, i.e. nonterminals of the
    form '[P1234*2_1]'.
    """
    smaller = WCFG([])
    logging.debug('Creating a smaller grammar for initial conditions...')
    for line in wcfg:
        if 0 < permutation_length(line.lhs) <= 2:
            smaller.add(line)
        elif line.lhs == root or line.lhs == '[UNK]':
            smaller.add(line)

    if intersection == 'nederhof':
        init_parser = Nederhof(smaller, wfsa)
    elif intersection == 'earley':
        init_parser = Earley(smaller, wfsa)
    else:
        raise NotImplementedError('I do not know this algorithm: %s' % intersection)

    logging.debug('Init Parsing...')
    init_forest = init_parser.do(root, goal)
    if not init_forest:
        print 'NO PARSE FOUND'
        return {}
    else:
        logging.debug('Forest: rules=%d', len(init_forest))

        # sort the forest
        logging.debug('Init Topsorting...')
        sorted_nodes = top_sort(init_forest)

        # calculate the inside weight of the sorted forest
        logging.debug('Init Inside...')
        init_inside_prob = inside(init_forest, sorted_nodes)

        logging.debug('Init Sampling...')
        gen_sampling = GeneralisedSampling(init_forest, init_inside_prob)
        init_d = gen_sampling.sample(goal)

        return get_conditions(init_d)
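# Purely illustrative reading of the 'milos' format named in the docstring
# above: we assume the digits between 'P' and '*' spell out the permutation,
# so '[P1234*2_1]' would have length 4. The repo's actual permutation_length
# may be defined differently; this sketch only clarifies the size filter.
import re

_MILOS_RE = re.compile(r'^\[P(\d+)\*')

def permutation_length(symbol):
    match = _MILOS_RE.match(symbol)
    return len(match.group(1)) if match else 0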
def read_grammar(rules_file, lexicon_file, transform=math.log):
    """Reads a WCFG from a rules file and a lexicon file, applying
    `transform` (log by default) to every weight."""
    return WCFG(chain(iterrules(rules_file, transform),
                      iterlexicon(lexicon_file, transform)))
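# Hedged usage sketch; the file names are invented, and we assume iterrules
# and iterlexicon accept open file objects. Pass transform=float instead of
# the default if the files already store log-probabilities.
wcfg = read_grammar(open('grammar/rules'), open('grammar/lexicon'))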