def create_taskgrammar(grammar, task, encoders):
    """Build a task-specific CFG rooted at ``S`` and save it to disk.

    The productions of the ``task`` nonterminal become the start productions.
    When the first right-hand-side symbol of such a production is a
    nonterminal whose name ends in ``_TASK``, that symbol's own productions
    are used instead. Productions of ``grammar`` whose left-hand side occurs
    in an already selected production are then added in a single pass.

    :param grammar: source grammar exposing ``productions()``.
    :param task: name of the task nonterminal.
    :param encoders: encoder names used to rewrite the ``ENCODERS`` rule.
    :return: the new ``CFG``; its productions are also written, one per
        line, to ``TASK_GRAMMAR_PATH``.
    """
    logger.info('Creating specific grammar for task %s' % task)
    start_token = Nonterminal('S')
    selected = []

    for task_rule in grammar.productions(Nonterminal(task)):
        head_symbol = task_rule.rhs()[0]
        if is_nonterminal(head_symbol) and head_symbol.symbol().endswith('_TASK'):
            selected += [
                Production(start_token, sub_rule.rhs())
                for sub_rule in grammar.productions(head_symbol)
            ]
        else:
            selected.append(Production(start_token, task_rule.rhs()))

    for rule in grammar.productions():
        # NOTE: ``selected`` grows while it is being iterated, so rules added
        # here are themselves visited later in the same inner loop.
        for chosen in selected:
            if rule.lhs() not in chosen.rhs() or rule in selected:
                continue
            if rule.lhs().symbol() != 'ENCODERS':
                selected.append(rule)
            elif len(encoders) > 0:
                # Restrict encoders to the feature types present in the dataset.
                selected.append(
                    Production(rule.lhs(), [Nonterminal(e) for e in encoders]))
            else:
                selected.append(Production(rule.lhs(), ['E']))

    task_grammar = CFG(start_token, selected)
    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))
    return task_grammar
def process_hybrid_productions(productions):
    """Find hybrid productions and build their terminal-free replacements.

    A production is hybrid when its right-hand side has more than one symbol
    and at least one of them is a terminal. Each terminal is replaced by a
    dummy nonterminal named after it, and a unit production
    ``dummy -> terminal`` is created for it.

    :param productions: iterable of productions (not modified).
    :return: ``(to_remove, to_add)``: the hybrid productions found, and the
        new productions (dummy unit rules plus the rewritten rules).
    """
    additions = []
    obsolete = []
    for prod in productions:
        symbols = prod.rhs()
        if len(symbols) <= 1:
            continue  # single-symbol rules cannot be hybrid

        rewritten_rhs = []
        saw_terminal = False
        for sym in symbols:
            if is_terminal(sym):
                stand_in = Nonterminal(sym)
                additions.append(Production(stand_in, [sym]))
                rewritten_rhs.append(stand_in)
                saw_terminal = True
            else:
                rewritten_rhs.append(sym)

        if saw_terminal:
            # The caller's list is left untouched; both changes are reported.
            additions.append(Production(prod.lhs(), rewritten_rhs))
            obsolete.append(prod)
    return obsolete, additions
def binarize(grammar):
    """Return a grammar whose right-hand sides have at most two symbols.

    A rule ``A -> X1 ... Xn`` with ``n > 2`` becomes a right-branching chain
    of binary rules. Each intermediate nonterminal is named
    ``A|<Xk-...-Xn>``, with terminals shown with a leading ``@``. Shorter
    rules are copied unchanged.
    """
    binary_rules = []
    for prod in grammar.productions():
        rhs = prod.rhs()
        if len(rhs) <= 2:
            binary_rules.append(prod)
            continue

        labels = [
            '@' + sym if isinstance(sym, str) else sym.symbol() for sym in rhs
        ]
        base_name = prod.lhs().symbol()
        head = prod.lhs()
        for idx in range(1, len(rhs) - 1):
            fresh = Nonterminal(base_name + '|<' + '-'.join(labels[idx:]) + '>')
            binary_rules.append(Production(head, (rhs[idx - 1], fresh)))
            head = fresh
        # The last two symbols hang off the final intermediate nonterminal.
        binary_rules.append(Production(head, rhs[-2:]))
    return CFG(grammar.start(), binary_rules)
def reinsert_unary_chains(tree, old_grammar):
    """Put unary chains removed from the grammar back into ``tree``, in place.

    For every subtree, search breadth-first through the unary nonlexical
    productions of ``old_grammar`` for a chain of nonterminals that starts at
    the subtree's label and ends in a nonterminal that rewrites to the
    subtree's children in ``old_grammar``. The intermediate nodes of that
    chain are inserted between the subtree and its children.

    :param tree: tree whose nodes have at most two children; mutated in place.
    :param old_grammar: grammar that still contains the unary productions.
    :raises IndexError: if the search runs out of candidate chains.
    """
    unary_rules = [
        rule for rule in old_grammar.productions()
        if len(rule) == 1 and rule.is_nonlexical()
    ]
    pending = [tree]
    while pending:
        current = pending.pop()
        if not isinstance(current, Tree):
            continue  # leaves need no repair
        assert len(current) <= 2

        original = current.copy()
        child_symbols = [
            kid if isinstance(kid, str) else Nonterminal(kid.label())
            for kid in current
        ]

        frontier = []
        chain = [Nonterminal(current.label())]
        candidate = Production(chain[-1], child_symbols)
        while candidate not in old_grammar.productions():
            frontier += [
                chain + [rule.rhs()[0]]
                for rule in unary_rules if rule.lhs() == chain[-1]
            ]
            chain = frontier.pop(0)
            candidate = Production(chain[-1], child_symbols)

        # Splice the chain's intermediate nodes in below ``current``.
        current[0:] = []
        tail = current
        for symbol in chain[1:]:
            link = Tree(symbol.symbol(), [])
            tail[0:] = [link]
            tail = link
        tail[0:] = list(original)
        pending.extend(tail)
def create_grammar() -> PCFG:
    """Induce a PCFG from the NLTK treebank sample with simplified labels.

    Productions whose left-hand side fails ``valid_nonterminal`` are skipped.
    Nonterminal rules keep only valid right-hand-side symbols, each passed
    through ``simplify_nonterminal``. Rules whose first right-hand-side symbol
    is not a nonterminal are rewritten to yield the simplified tag's own name
    as the terminal. Prints the production count, the sorted left-hand-side
    nonterminals and their count.
    """
    # Size notes recorded by the original author:
    #   21,763 productions with word terminals
    #    8,028 productions with pos terminals
    #    6,275 productions with nonterminals without digits
    #    5,402 productions with nonterminals without punctuation
    #    2,972 productions with nonterminals without suffixes
    #      707 nonterminals
    #      190 nonterminals without digit labels
    #      180 nonterminals without punctuation
    #       63 nonterminals without suffixes
    start_symbol = Nonterminal('S')
    productions = []
    for tree in nltk.corpus.treebank.parsed_sents():
        for production in tree.productions():
            if not valid_nonterminal(production.lhs()):
                continue
            first_symbol = production.rhs()[0]
            simple_lhs = simplify_nonterminal(production.lhs())
            if isinstance(first_symbol, Nonterminal):
                simple_rhs = tuple(
                    simplify_nonterminal(sym)
                    for sym in production.rhs() if valid_nonterminal(sym))
            else:
                simple_rhs = (simple_lhs.symbol(), )
            productions.append(Production(simple_lhs, simple_rhs))

    grammar = nltk.induce_pcfg(start_symbol, productions)
    print(len(grammar.productions()))
    nonterminals = {rule.lhs() for rule in grammar.productions()}
    print(sorted(nonterminals))
    print(len(nonterminals))
    return grammar
def convert_hybrid(grammar):
    '''
    Rewrite mixed rules such as [A -> 'b' C], whose right-hand side holds
    both terminals and nonterminals, as [A -> B C] plus [B -> 'b'], where B
    is a dummy nonterminal named after the terminal. All other rules are
    copied unchanged. Returns a new CFG with the same start symbol.
    '''
    converted = []
    for prod in grammar.productions():
        head = prod.lhs()
        symbols = prod.rhs()
        if not (prod.is_lexical() and len(symbols) > 1):
            converted.append(prod)
            continue

        replaced = []
        for sym in symbols:
            if is_terminal(sym):
                proxy = Nonterminal(sym)
                replaced.append(proxy)
                # lexical rule that lets the dummy nonterminal yield the terminal
                converted.append(Production(proxy, (sym, )))
            else:
                replaced.append(sym)
        # the mixed rule itself, now with nonterminals only
        converted.append(Production(head, tuple(replaced)))
    return CFG(grammar.start(), converted)
def test_production_from_grammar(self):
    """Check which productions ``parse_cfg`` yields for a small toy grammar.

    Expected: productions built from ``Nonterminal`` objects and unquoted
    terminal strings. Not expected: productions built from bare strings in
    place of nonterminals, or terminals that still carry their quotes.
    """
    # One rule per line, so that rules do not run together.
    grammar_str = """
    S -> NP VP
    PP -> P NP
    NP -> Det N | NP PP
    VP -> V NP | VP PP
    Det -> 'a' | 'the'
    N -> 'dog' | 'cat'
    V -> 'chased' | 'sat'
    P -> 'on' | 'in'
    """
    grammar = parse_cfg(grammar_str)
    productions = grammar.productions()

    def missing_msg(production):
        return "Expect to find '{}', but can not see in \n{}".format(
            production, grammar_str)

    def unexpected_msg(production):
        # Separate message so a failing assertNotIn does not report the
        # text of an earlier, unrelated expectation.
        return "Expect NOT to find '{}', but it is in \n{}".format(
            production, grammar_str)

    expect_production = Production(
        lhs=Nonterminal("S"), rhs=[Nonterminal("NP"), Nonterminal("VP")])
    self.assertIn(expect_production, productions,
                  missing_msg(expect_production))

    expect_production = Production(lhs=Nonterminal("N"), rhs=['dog'])
    self.assertIn(expect_production, productions,
                  missing_msg(expect_production))

    expect_not_in = Production(lhs="S", rhs=["NP", "VP"])
    self.assertNotIn(expect_not_in, productions,
                     unexpected_msg(expect_not_in))

    expect_not_in = Production(lhs=Nonterminal("N"), rhs=["'dog'"])
    self.assertNotIn(expect_not_in, productions,
                     unexpected_msg(expect_not_in))
def remove_unary_rules(grammar):
    """Remove unary nonterminal productions ``A -> B``.

    Every unary nonlexical rule ``A -> B`` is replaced by ``A -> rhs`` for
    each rule ``B -> rhs`` of ``grammar``. Replacements that are again unary
    and nonlexical are expanded in turn. Each distinct unary rule is expanded
    only once, so cyclic unary rules (``A -> B``, ``B -> A``) terminate
    instead of looping forever.

    :param grammar: grammar exposing ``productions()`` and ``start()``.
    :return: ``(new_grammar, grammar)``: the grammar without unary rules and
        the unchanged input grammar.
    """
    result = []
    unary = []
    for rule in grammar.productions():
        if len(rule) == 1 and rule.is_nonlexical():
            unary.append(rule)
        else:
            result.append(rule)

    # Unary rules already queued; guards against cycles and repeated work.
    seen = set(unary)
    while unary:
        rule = unary.pop(0)
        for item in grammar.productions(lhs=rule.rhs()[0]):
            new_rule = Production(rule.lhs(), item.rhs())
            if len(new_rule) != 1 or new_rule.is_lexical():
                result.append(new_rule)
            elif new_rule not in seen:
                seen.add(new_rule)
                unary.append(new_rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar, grammar
def train():
    """Estimate PCFG rule probabilities from the NLTK Penn Treebank sample.

    Trees are converted to Chomsky normal form in place, lexical rules are
    lower-cased, and each rule's probability is its count divided by the
    count of its left-hand side. The result is passed through
    ``katz_smooth`` together with the vocabulary.

    :return: dict with keys ``vocab``, ``left_rules``, ``right_rules``,
        ``unary_rules``, ``rules_to_prob`` and ``terminal_nonterms``.
    """
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")

    # Parse trees from the treebank, binarised in place.
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)

    # Vocabulary: the most frequent word forms. Lexical rules below are
    # lower-cased, so the vocabulary is built from lower-cased words too;
    # counting the raw-cased words would not match the rules' terminals.
    vocab_size = 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, _ in Counter(words).most_common(vocab_size)]

    # Relative-frequency estimate of each rule given its left-hand side.
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    for sent in tbank_trees:
        for production in sent.productions():
            rhs = production.rhs()
            if len(rhs) == 1 and not isinstance(rhs[0], Nonterminal):
                production = Production(production.lhs(), [rhs[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())

    # Index rules by first / second right-hand-side symbol; rules with a
    # single right-hand-side symbol are indexed separately.
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)

    # Number of distinct lexical rules per left-hand-side nonterminal.
    terminal_nonterms = defaultdict(int)
    for rule in rules_to_prob:
        if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str):
            terminal_nonterms[rule.lhs()] += 1

    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms,
    }
    return pcfg_parser
def _binarize(p, so_far=[]): if len(p.rhs()) <= 2: so_far.append(p) return so_far else: new_nont = Nonterminal(p.lhs()._symbol + '_' + str(next(counter))) so_far.append(Production(p.lhs(), [p.rhs()[0], new_nont])) return _binarize(Production(new_nont, p.rhs()[1:]), so_far)
def update_dictionary(lhs_dict, whole_dict, node):
    """Count the production rooted at ``node`` in both tallies.

    The production is ``label -> get_child_names(node)``. ``lhs_dict`` counts
    occurrences per left-hand side and ``whole_dict`` per full production.
    Missing entries start at zero. Both dicts are updated in place.
    """
    rule = Production(Nonterminal(node.label()), get_child_names(node))
    head = rule.lhs()
    lhs_dict[head] = lhs_dict.get(head, 0) + 1
    whole_dict[rule] = whole_dict.get(rule, 0) + 1
def make_grammar(parse, mrepr='tokens-and-lemmas'):
    """
    Return a list of Productions built from an output parse of L{MBMA}.

    Each morph contributes a lexical production for its leaf. A morph whose
    tag contains ``|`` (``super|sub``) is an affix, inflection or linking
    element: it gets a labelled nonterminal (``INFL:``, ``PRE:``, ``SUF:`` or
    ``LE:`` followed by the lemma) and, when the sub-tag yields daughters, a
    production ``super -> daughters``. For example the original docstring
    gives this parse::

        [('V|*V', 'ver'), ('V', 'eis'), ('INFLtWB', 't')]

    as producing::

        [PRE:ver -> 'ver', V -> PRE:ver V, V -> 'eis',
         INFL:t -> 't', V -> V INFL:t]

    Args:
        - parse (list): a parse returned by :func:`mbmp.MBMA.classify`;
          each item exposes ``pos``, ``lemma`` and ``pprint(mrepr)``.
        - mrepr: representation passed to ``pprint`` to render the leaf.

    Returns:
        list -- a list of Productions.
    """
    rules = []
    for morph in parse:
        tag, lemma = morph.pos, morph.lemma
        if tag.endswith('WB'):
            tag = tag[:-2]
        leaf = morph.pprint(mrepr)

        # Tags without '|' are plain lexical morphs.
        if '|' not in tag:
            rules.append(Production(Nonterminal(tag), [leaf]))
            continue

        parent, tag = tag.split('|')
        if tag.startswith('INFL'):
            label = 'INFL:%s' % lemma
            daughters = [Nonterminal(label), Nonterminal(tag[-1])]
        elif tag.endswith('INFL'):
            label = 'INFL:%s' % lemma
            daughters = [Nonterminal(tag[0]), Nonterminal(label)]
        elif tag.startswith('*'):
            # prefix
            label = 'PRE:%s' % lemma
            daughters = nonterminals([label] + list(tag[1:]))
        elif tag.endswith(('*', '*WB')):
            # suffix
            tag = tag[:tag.find('*')]
            label = 'SUF:%s' % lemma
            daughters = nonterminals(list(tag) + [label])
        else:
            # linking element
            label = 'LE:%s' % lemma
            star = tag.find('*')
            daughters = nonterminals(
                list(tag[:star]) + [label] + list(tag[star + 1:]))

        if 'x' in tag:
            rules.append(Production(Nonterminal('x'), [leaf]))
        rules.append(Production(Nonterminal(label), [leaf]))
        if daughters:
            rules.append(Production(Nonterminal(parent), daughters))
    return rules
def __init__(self, lhs, rhs, cost):
    """
    Construct a new production that carries a cost.

    :param lhs: The left-hand side of the new production.
    :type lhs: Nonterminal
    :param rhs: The right-hand side of the new production.
    :type rhs: sequence(Nonterminal and terminal)
    :param cost: Cost of the production; it is stored negated as the
        log-probability (``logprob=-cost``) of the probabilistic mix-in.
    """
    # The mix-in receives the negated cost as its log-probability.
    ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
    Production.__init__(self, lhs, rhs)
def _remove_empty_productions(input_productions, letters): """Remove productions with empty right hand sides.""" copied_prods = deepcopy(input_productions) # # Find all nonterminals that generate the emptry string. # # Basis: A nonterminal generates the empty string if it is the LHS of a # production thats RHS is empty. gen_empty = [prod.lhs() for prod in copied_prods if len(prod.rhs()) == 0] N = len(gen_empty) # Induction: while True: for nonterm in gen_empty: for prod in copied_prods: if nonterm in prod.rhs(): better = list(prod.rhs()) better.remove(nonterm) prod._rhs = tuple(better) gen_empty[:] = [prod.lhs() for prod in copied_prods if len(prod.rhs()) == 0] new_len = len(gen_empty) if new_len == N: break N = new_len print 'gen_empty', gen_empty # ADD NEW RULES new_prods = [] productions = deepcopy(input_productions) for nonterm in gen_empty: prods = [prod for prod in productions if len(prod.rhs()) == 2 and nonterm in prod.rhs()] for prod in prods: rhs = list(prod.rhs()) while nonterm in rhs: lhs = prod.lhs() rhs.remove(nonterm) p = Production(lhs, tuple(rhs)) new_prods.append(p) productions += new_prods productions[:] = [p for p in productions if p.rhs()] return productions
def induce_structure(self, sentences):
    """Induce a PCFG from character sequences and store it in ``self.grammar``.

    Each sentence is split into its characters. Every distinct character
    gets a unary production (all digits share the ``Digit`` nonterminal).
    Productions are then generated repeatedly until every sentence has been
    reduced to a single symbol. Those symbols become start symbols, and their
    productions are duplicated under ``self.start``.

    :param sentences: iterable of strings (or other iterables of symbols).
    """
    sentences = [list(s) for s in sentences]
    start_symbols = set()
    productions = []
    prod_table = {}

    # group all digits together
    digit_terminals = {str(i) for i in range(10)}

    # unary rules
    terminals = set()
    for s in sentences:
        terminals.update(s)
    for t in terminals:
        if t in digit_terminals:
            nt = nltk.Nonterminal("Digit")
        else:
            nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
        p = Production(nt, [t])
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
    sentences = self.apply_unary_prod(sentences, prod_table)

    while len(sentences) > 0:
        if self.has_recursion(sentences):
            p = self.generate_recursive_prod(sentences)
        else:
            p = self.generate_most_frequent_prod(sentences)
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
        sentences = self.update_with_prod(sentences, prod_table)

        # Fully reduced sentences contribute a start symbol and drop out.
        new_sentences = []
        for s in sentences:
            if len(s) == 1:
                start_symbols.add(s[0])
            else:
                new_sentences.append(s)
        sentences = new_sentences

    # generate the start productions
    for symbol in start_symbols:
        # Iterate over a snapshot: appending to the list being iterated
        # would never terminate if ``self.start`` were itself a start symbol.
        for p in list(productions):
            if p.lhs() == symbol:
                productions.append(Production(self.start, p.rhs()))
    self.grammar = nltk.induce_pcfg(self.start, productions)
def convert_unit(grammar):
    '''
    Eliminate unit rules of the form [A -> B], where the rhs is a single
    non-terminal, by promoting what B derives: a lexical rule
    [B -> 'b'] => [A -> 'b'], or a non-unit rule [B -> C D] => [A -> C D].
    Chains of unit rules are followed until such a rule is reached.

    Each distinct unit rule is expanded only once, so cyclic unit rules
    (A -> B, B -> A) terminate. A rule with an empty rhs reached through a
    unit rule is promoted like any other non-unit rule rather than being
    queued as a unit rule.

    Returns a new CFG with the same start symbol.
    '''
    new_rules = []
    unit_rules = []
    for rule in grammar.productions():
        # check for unit rules
        if rule.is_nonlexical() and len(rule) == 1:
            unit_rules.append(rule)
        else:
            new_rules.append(rule)

    # Unit rules already queued; guards against cycles and repeated work.
    seen = set(unit_rules)

    # follow each unit rule until something that is not a unit rule is found
    while unit_rules:
        rule = unit_rules.pop(0)
        lhs = rule.lhs()
        # find rules that can derive the rhs to something else
        for cascade_rule in grammar.productions(lhs=rule.rhs()[0]):
            temp_rule = Production(lhs, cascade_rule.rhs())
            if cascade_rule.is_lexical() or len(cascade_rule) != 1:
                new_rules.append(temp_rule)
            elif temp_rule not in seen:
                seen.add(temp_rule)
                unit_rules.append(temp_rule)

    new_grammar = CFG(grammar.start(), new_rules)
    return new_grammar
def proper_rule(person: FeatStruct) -> Production:
    """
    Build the production that generates a person's proper noun.

    :param person: feature structure describing one person; it must contain
        a ``proper`` attribute.
    :return: the rule ``ProperName[proper=<name>] -> <name>``,
        e.g. ``ProperName[proper=Bas] -> "Bas"``.
    """
    name_head = FeatStructNonterminal("ProperName[proper=%s]" % person["proper"])
    name_leaf = [person["proper"]]
    return Production(name_head, name_leaf)
def recursively_replace_lhs(rules, lhs, singles, keep_original):
    """Substitute every occurrence of ``lhs`` in ``rules`` by each single.

    For a rule whose right-hand side contains ``lhs``, the first occurrence
    is replaced by the right-hand side of every production in ``singles``.
    The resulting rules are processed again until no occurrence is left.

    :param rules: productions to rewrite.
    :param lhs: the symbol to substitute.
    :param singles: productions ``lhs -> x`` with exactly one rhs symbol.
    :param keep_original: when true, rules containing ``lhs`` are kept in the
        output alongside their expansions.
    :return: list of rewritten productions, in input order.
    """
    assert all([lhs == single.lhs() for single in singles])
    assert all([len(single.rhs()) == 1 for single in singles])

    rewritten = []
    for rule in rules:
        if lhs not in rule.rhs():
            rewritten.append(rule)
            continue
        if keep_original:
            rewritten.append(rule)

        # position of the first occurrence
        for where, symbol in enumerate(rule.rhs()):
            if symbol == lhs:
                break
        before = list(rule.rhs()[:where])
        after = list(rule.rhs()[where + 1:])
        expansions = [
            Production(rule.lhs(), before + list(single.rhs()) + after)
            for single in singles
        ]
        rewritten.extend(
            recursively_replace_lhs(expansions, lhs, singles, keep_original))
    return rewritten
def test_current_production(self):
    """``current_production`` returns the production at the root of a tree."""
    cases = [
        (""" (S (sentence (type_1_sentence_coord_1 (type_1_sentence_coord_2 (type_2_sentence (THERE There) (AUX is) (Noun_Phrase (det (DET an)) (Noun_w_support (Adj_phrase (Adj_core (JJ small)) (AND and) (Adj_phrase (Adj_core (JJ red)))) (Noun_Count (NN apple))))))) (PERIOD .))) """,
         Production(Nonterminal("S"), [Nonterminal("sentence")])),
    ]
    for bracketed, expected in cases:
        parsed = Tree.parse(bracketed)
        actual = current_production(parsed)
        self.assertEqual(expected, actual)
def parse(self, phrasetokens, cleantree=True, maxtrees=200):
    '''
    Chart-parse a phrase and return its distinct parse trees.

    The phrase is wrapped in a start marker, a trailing comma and an end
    marker. Lexical productions are added to the grammar for every token
    whose word is not yet in the grammar's lexical index, and the grammar is
    re-initialised when that happens.

    :param phrasetokens: list of tokens; the last token's ``slice.stop`` is
        used as the position of the added comma and end marker.
    :param cleantree: when true, ``cleanparsetree`` is applied to each tree.
    :param maxtrees: maximum number of parses taken from the chart.
    :return: list of distinct trees, in the order they were produced.
    '''
    COMMA = FGTerminal(',', 'COMMA', phrasetokens[-1].slice.stop)
    COMMA.lexentry = lexicon[(',',)]
    opening = FGTerminal('¢', 'EOP', 0)
    closing = FGTerminal('$', 'EOP', phrasetokens[-1].slice.stop)
    tokens = [opening] + phrasetokens + [COMMA] + [closing]

    # Add lexical productions for words the grammar has not seen yet,
    # e.g. tokens introduced by the POS processor.
    grammar_changed = False
    for fltoken in tokens:
        if self._grammar._lexical_index.get(fltoken.lexword):
            continue
        grammar_changed = True
        for lexent in fltoken.lexentry:
            self._grammar._productions.append(
                Production(lexent, (fltoken.lexword,)))
    if grammar_changed:
        # Re-run the grammar's initialiser so its indexes see the new rules.
        self._grammar.__init__(self._grammar._start,
                               self._grammar._productions)

    parseable = [tk for tk in tokens if tk.POS != 'NULL']
    self._chart = self._parser.chart_parse(parseable)
    treegen = self._chart.parses(self._grammar.start(), tree_class=Tree)

    trees = []
    for count, tree in enumerate(treegen):
        if count >= maxtrees:
            break
        if cleantree:
            cleanparsetree(tree)
        if tree not in trees:
            trees.append(tree)
    return trees
def productions(self):
    """Return this node's production followed by those of its subtrees.

    The node's own production is ``label -> children_name()``. Children that
    are ``Tree`` instances contribute their productions recursively, in
    child order.
    """
    own_rule = Production(Nonterminal(self._label), self.children_name())
    collected = [own_rule]
    for child in self._child:
        if not isinstance(child, Tree):
            continue
        collected += child.productions()
    return collected
def compact_nonterminal(x: str, nont: Nonterminal):
    """Inline every use of ``nont`` and return the grammar as text.

    The grammar in ``x`` is parsed, the productions of ``nont`` are removed,
    and each occurrence of ``nont`` on a right-hand side is replaced by each
    of its expansions. This repeats (first occurrence per production per
    pass) until nothing changes.

    :param x: grammar text accepted by ``nltk.CFG.fromstring``.
    :param nont: the nonterminal to eliminate.
    :return: the resulting productions, one per line, with doubled
        backslashes collapsed to single ones.
    :raises ValueError: if ``nont`` occurs in one of its own expansions;
        inlining would then never terminate.
    """
    prods = nltk.CFG.fromstring(x).productions()
    lhs_prods = [p for p in prods if p.lhs() == nont]
    if any(nont in p.rhs() for p in lhs_prods):
        raise ValueError(
            'Cannot compact recursive nonterminal %s' % (nont,))
    old_prods = [p for p in prods if p not in lhs_prods]

    while True:
        new_prods = []
        for p in old_prods:
            rhs = p.rhs()
            if nont not in rhs:
                new_prods.append(p)
                continue
            # first occurrence; slicing past the end yields an empty tuple,
            # so the last position needs no special case
            i = rhs.index(nont)
            for lhsp in lhs_prods:
                new_rhs = rhs[:i] + lhsp.rhs() + rhs[i + 1:]
                new_prods.append(Production(p.lhs(), new_rhs))
        if new_prods == old_prods:
            break
        old_prods = new_prods

    new_str = ''.join(
        [str(p).replace('\\\\', '\\') + '\n' for p in new_prods])
    return new_str
def process_unit_productions(productions, nonterminal_dict):
    """Run one pass of unit-production elimination.

    For a unit production ``A -> B`` (``B`` a nonterminal), ``A`` is recorded
    in ``nonterminal_dict[B]`` and the production is marked for removal. For
    any other production ``B -> rhs`` where ``B`` has recorded parents, a
    production ``A -> rhs`` is proposed for every parent ``A`` unless it is
    already known.

    :param productions: list of productions (not modified).
    :param nonterminal_dict: maps a nonterminal to the list of left-hand
        sides of unit productions that rewrite to it; updated in place.
    :return: ``(to_add, nonterminal_dict, need_another_loop, to_remove)``,
        where ``need_another_loop`` is 1 if anything changed, else 0.
    """
    # set mirror of the production list, for fast membership tests
    known = set(productions)
    changed = 0
    removals = []
    additions = []
    for prod in productions:
        head = prod.lhs()
        body = prod.rhs()
        if len(body) == 1 and is_nonterminal(body[0]):
            # A -> B with B a nonterminal
            removals.append(prod)
            child = body[0]
            if child not in nonterminal_dict:
                nonterminal_dict[child] = [head]
                changed = 1
            elif head not in nonterminal_dict[child]:
                nonterminal_dict[child].append(head)
                changed = 1
        elif head in nonterminal_dict:
            # B -> C: lift it to every A with A -> B
            for parent in nonterminal_dict[head]:
                lifted = Production(parent, body)
                if lifted not in known:
                    known.add(lifted)
                    additions.append(lifted)
                    changed = 1
    return additions, nonterminal_dict, changed, removals
def cover_tree(grammar, tree):
    """Return True if every production of ``tree`` occurs in ``grammar``.

    Grammar productions are rebuilt as plain ``Production`` objects from
    their left- and right-hand sides before comparing.

    The grammar's production list is only read. The previous version
    appended to that list while iterating over it, which never terminated
    for a non-empty grammar and modified the grammar's own list.
    """
    tree_productions = set(tree.productions())
    gram_productions = {
        Production(p.lhs(), p.rhs()) for p in grammar.productions()
    }
    return tree_productions.issubset(gram_productions)
def add_new_vocab_rule(self, rule):
    """
    Append a vocabulary rule and rebuild ``self.cfg`` and ``self.parser``.

    :param rule: indexable pair; ``rule[0]`` names the left-hand-side
        nonterminal and ``rule[1]`` is used as the right-hand side.
    """
    vocab_rule = Production(NT(rule[0]), rule[1])
    self.rules.append(vocab_rule)
    rebuilt = ContextFreeGrammar(NT("S"), self.rules)
    self.cfg = rebuilt
    self.parser = EarleyChartParser(rebuilt, trace=0)
def literal_production(key, rhs):
    """
    Return the production ``<key> -> rhs`` with a single right-hand-side item.

    :param key: symbol name for the left-hand-side nonterminal.
    :param rhs: string literal used as the only right-hand-side item.
    """
    return Production(Nonterminal(key), [rhs])
def test_parse_production(self):
    """``parse_production`` turns a rule string into the matching Production."""
    cases = [
        ("PP -> P NP",
         Production(Nonterminal("PP"),
                    [Nonterminal("P"), Nonterminal("NP")])),
        ("S -> NP VP",
         Production(Nonterminal("S"),
                    [Nonterminal("NP"), Nonterminal("VP")])),
        ("THERE -> 'There'",
         Production(Nonterminal("THERE"), ['There'])),
    ]
    for index, (rule_text, expected) in enumerate(cases):
        actual = parse_production(rule_text)
        message = "Sentence {}-th -- '{}' -- Expect result: {} / Actual result: {}".format(
            index, rule_text, expected, actual)
        self.assertEqual(expected, actual, message)
def simple_rule(r):
    """Return a copy of production ``r`` with simplified nonterminals.

    The left-hand side always goes through ``simple_nonterminal``. The
    right-hand side does so only for nonlexical productions; otherwise it is
    reused unchanged.
    """
    head = simple_nonterminal(r.lhs())
    if not r.is_nonlexical():
        return Production(head, r.rhs())
    return Production(head, [simple_nonterminal(sym) for sym in r.rhs()])
def fail_demo():
    """
    Launch the recursive-descent demo on a grammar that, per the original
    author, should not work with backtracking for all inputs.

    The grammar is ``S -> A S A | A A`` and ``A -> 'a'``; the input is six
    ``a`` tokens.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    start = Nonterminal('S')
    a_sym = Nonterminal('A')
    rules = (
        Production(start, [a_sym, start, a_sym]),
        Production(start, [a_sym, a_sym]),
        Production(a_sym, ['a']),
    )
    demo_grammar = ContextFreeGrammar(start, rules)
    tokens = "a a a a a a".split()
    RecursiveDescentApp(demo_grammar, tokens).mainloop()
def purge_implicit_h(x):
    """Drop the terminal ``h`` from every production of grammar text ``x``.

    :param x: grammar text accepted by ``nltk.CFG.fromstring``.
    :return: the rewritten productions, one per line, with doubled
        backslashes collapsed to single ones.
    """
    parsed = nltk.CFG.fromstring(x)
    rendered = []
    for prod in parsed.productions():
        kept = [sym for sym in prod.rhs() if sym != 'h']
        cleaned = Production(prod.lhs(), kept)
        rendered.append(str(cleaned).replace('\\\\', '\\') + '\n')
    return ''.join(rendered)
def fix_parse_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return a list of productions.

    :param line: rule text of the form ``LHS -> rhs | rhs ...``.
    :param nonterm_parser: callable ``(line, pos) -> (nonterminal, new_pos)``
        used to read nonterminals.
    :param probabilistic: when true, probabilities recognised by
        ``_PROBABILITY_RE`` are parsed and ``FixPP`` objects are returned;
        otherwise plain ``Production`` objects are returned.
    :return: one production per ``|``-separated alternative.
    :raises ValueError: on a missing arrow, an unterminated string or a
        probability greater than 1.0.
    """
    # Local import so the block does not depend on a module-level import.
    import ast

    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m:
        raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    probabilities = [0.0]
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1], ))

        # String -- add terminal.
        elif (line[pos] in "\'\"" or line[pos:pos + 2] in ('u"', "u'")):
            m = _TERMINAL_RE.match(line, pos)
            if not m:
                raise ValueError('Unterminated string')
            # SECURITY: this used to be eval(), which would run arbitrary
            # code found in grammar text. literal_eval decodes the quoted
            # literal the same way but accepts literals only.
            rhsides[-1].append(ast.literal_eval(m.group(1)))
            pos = m.end()

        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            pos = m.end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)

    if probabilistic:
        return [
            FixPP(lhs, rhs, prob=probability)
            for (rhs, probability) in zip(rhsides, probabilities)
        ]
    return [Production(lhs, rhs) for rhs in rhsides]