def predictor(state, k, grammar):
    """Enqueue a new state for each admissible expansion of the dotted symbol.

    The symbol at the dot position of ``state``'s rule is looked up in
    ``grammar``. Every ``(rule, prob)`` alternative that is not filtered out
    by ``check_if_in_first`` becomes a new state that starts at position
    ``k`` with cost ``(-k, -prob)``; the state is recorded in
    ``state_tracker[k]`` and pushed onto ``queue``.

    Relies on names defined outside this function: ``string``, ``terminals``,
    ``nonterminals``, ``first_set``, ``state_tracker``, ``queue``, ``node``
    and ``check_if_in_first``.

    Args:
        state: Queue entry exposing ``state`` (a 5-tuple
            ``(rule, (i, dot_ix), parent_var, k, cur_tree)``) and ``node``.
        k: Input position used for the initial guard only; it is then
            replaced by the position stored inside ``state.state``.
        grammar: Mapping from a symbol to a sequence of ``(rule, prob)``
            pairs, where ``rule`` is a whitespace-separated string.

    Returns:
        None. Results are communicated through ``state_tracker`` and ``queue``.
    """
    # Give up when the position is past the input or the token there is not
    # a known terminal.
    if k >= len(string) or string[k] not in terminals:
        return

    production, (_origin, dot), _parent_var, k, _tree = state.state
    parent = state.node
    symbols = production.split()
    expand = symbols[dot]

    # NOTE(review): a truthy result from check_if_in_first is treated as
    # "do not pursue this symbol"; its exact meaning is not visible here.
    if check_if_in_first(expand, nonterminals, terminals, first_set, k,
                         string):
        return

    heuristic = 0
    for expansion, prob in grammar[expand]:
        leftmost = expansion.split()[0].strip()
        if check_if_in_first(leftmost, nonterminals, terminals, first_set, k,
                             string):
            continue
        predicted = (expansion, (k, 0), expand, k, (expand, ))
        child = node(expand, [], parent, start=k)
        predicted_cost = (-k, -prob)
        state_tracker[k][(predicted, predicted_cost)] = child
        queue.add_node(predicted, child, predicted_cost, heuristic)
def make_copy(parse_tree):
    """Return a recursive copy of ``parse_tree`` and everything below it.

    A new ``node`` is built for ``parse_tree`` and, recursively, for each of
    its children, carrying over ``var``, ``start`` and ``end``.

    The root of the copy keeps the same ``parent`` reference as the original
    (the parent itself is not copied). Every copied descendant, however, is
    re-parented onto its copied parent so the new subtree is internally
    consistent; previously copied children kept pointing at the nodes of the
    original tree.

    Args:
        parse_tree: A ``node`` exposing ``var``, ``children``, ``parent``,
            ``start`` and ``end``.

    Returns:
        The newly built ``node`` mirroring ``parse_tree``.
    """
    node_copy = node(parse_tree.var, [], parse_tree.parent,
                     start=parse_tree.start, end=parse_tree.end)
    for child in parse_tree.children:
        child_copy = make_copy(child)
        # The recursive call built child_copy with the *original* parent;
        # point it at the copy instead so upward links stay inside the copy.
        child_copy.parent = node_copy
        node_copy.children.append(child_copy)
    return node_copy
def predictor(state, k, grammar):
    """Enqueue one new state per alternative of the dotted symbol.

    For the symbol at the dot position of ``state``'s rule, each
    ``(rule, prob)`` alternative in ``grammar`` yields a new state starting
    at position ``k`` whose cost is ``(-k, state.cost[1] * prob)``. The new
    state is recorded in ``state_tracker[k]`` and pushed onto ``queue``.

    Relies on ``state_tracker``, ``queue`` and ``node`` from the enclosing
    scope.

    Args:
        state: Queue entry exposing ``state`` (a 6-tuple
            ``(rule, (i, dot_ix), parent_var, k, prix, rix)``), ``node`` and
            ``cost``.
        k: Ignored on entry; it is overwritten by the position stored in
            ``state.state``.
        grammar: Mapping from a symbol to a sequence of ``(rule, prob)``
            pairs, where ``rule`` is a whitespace-separated string.

    Returns:
        None.
    """
    production, (_origin, dot), _parent_var, k, _prix, rix = state.state
    parent = state.node
    symbols = production.split()
    cost_so_far = state.cost
    heuristic = 0

    expand = symbols[dot]
    for rule_ix, (expansion, prob) in enumerate(grammar[expand]):
        # The last two slots carry the index of the rule being expanded from
        # and the index of the alternative chosen here.
        predicted = (expansion, (k, 0), expand, k, rix, rule_ix)
        child = node(expand, [], parent, start=k)
        predicted_cost = (-k, cost_so_far[1] * prob)
        state_tracker[k][(predicted, predicted_cost)] = child
        queue.add_node(predicted, child, predicted_cost, heuristic)
def initialize_state(start_k, first_variable, state_tracker, grammar):
    """Create the queue and seed it with the first rule of ``first_variable``.

    Args:
        start_k: Input position at which parsing starts; also the index into
            ``state_tracker`` where the seed state is recorded.
        first_variable: Symbol whose first alternative in ``grammar`` seeds
            the parse.
        state_tracker: Indexable collection of dicts; ``state_tracker[start_k]``
            is updated in place.
        grammar: Mapping from a symbol to a sequence of ``(rule, prob)`` pairs.

    Returns:
        A tuple ``(queue, state_tracker, max_node)`` where ``queue`` is a new
        ``AStarQ`` holding the seed state, ``state_tracker`` is the object
        passed in, and ``max_node`` is ``(root_node, (-start_k, -prob))``.
    """
    first_rule, prob = grammar[first_variable][0]
    root = node(first_variable, [], None, start=start_k, end=-1)

    seed_state = (first_rule, (start_k, 0), first_variable, start_k,
                  ('<P>', ))
    seed_cost = (-start_k, -prob)
    state_tracker[start_k][(seed_state, seed_cost)] = root

    queue = AStarQ()
    queue.add_node(seed_state, root, seed_cost, 0)

    return queue, state_tracker, (root, (-start_k, -prob))
def scanner(state, k, max_node):
    """Try to match the dotted symbol against the input token and advance.

    The token ``string[k]`` matches when it equals the dotted symbol followed
    by one of the suffixes ``'er'``, ``'ish'``, ``'est'``, ``'der'``,
    ``'dish'`` or nothing. On a match the parent node is copied with
    ``make_copy``, a leaf for the symbol is appended to the copy, and the
    advanced state is recorded in ``state_tracker[k + 1]`` and pushed onto
    ``queue`` unless the one-symbol look-ahead check vetoes it.

    Relies on ``string``, ``nonterminals``, ``terminals``, ``first_set``,
    ``state_tracker``, ``queue``, ``node``, ``make_copy`` and
    ``check_if_in_first`` from the enclosing scope.

    Args:
        state: Queue entry exposing ``state`` (a 5-tuple
            ``(rule, (i, dot_ix), parent_var, k, cur_tree)``), ``node`` and
            ``cost``.
        k: Ignored on entry; overwritten by the position in ``state.state``.
        max_node: Pair ``(node, cost)`` describing the best node seen so far.

    Returns:
        The (possibly updated) ``max_node`` pair.
    """
    rule, (origin, dot), parent_var, k, tree = state.state
    parent = state.node
    symbols = rule.split()
    cost_so_far = state.cost
    heuristic = 0

    if k >= len(string):
        return max_node

    expected = symbols[dot]
    accepted = {
        expected + suffix
        for suffix in ('er', 'ish', 'est', 'der', 'dish', '')
    }
    if string[k] not in accepted:
        return max_node

    advanced = (rule, (origin, dot + 1), parent_var, k + 1,
                (*tree, (expected, )))
    advanced_cost = (-k - 1, cost_so_far[1])

    # Work on a copy so the node held by the incoming state is left untouched.
    parent = make_copy(parent)
    parent.children.append(node(expected, [], parent, start=k, end=k + 1))
    parent.end = max(k + 1, parent.end)

    # Prefer the node that reaches furthest; break ties on the smaller cost.
    if parent and (parent.end > max_node[0].end or
                   (parent.end == max_node[0].end
                    and advanced_cost < max_node[1])):
        max_node = (parent, advanced_cost)

    # One-symbol look-ahead: a truthy check on the next rule symbol drops
    # the advanced state.
    if dot + 1 < len(symbols) and check_if_in_first(
            symbols[dot + 1], nonterminals, terminals, first_set, k + 1,
            string):
        return max_node

    state_tracker[k + 1][(advanced, advanced_cost)] = parent
    queue.add_node(advanced, parent, advanced_cost, heuristic)
    return max_node
def scanner(state, k, grammar):
    """Advance the dot over a rule symbol that matches the input token.

    The dotted rule symbol matches when it equals ``string[k]`` followed by
    one of the suffixes ``'er'``, ``'ish'``, ``'est'``, ``'der'``, ``'dish'``
    or nothing. On a match a leaf node is appended to the state's node (in
    place), its ``end`` is extended, and the advanced state is pushed onto
    ``queue`` with cost ``(-k - 1, state.cost[1])``.

    Relies on ``string``, ``queue`` and ``node`` from the enclosing scope.

    Args:
        state: Queue entry exposing ``state`` (a 6-tuple
            ``(rule, (i, dot_ix), parent_var, k, prix, rix)``), ``node`` and
            ``cost``.
        k: Ignored on entry; overwritten by the position in ``state.state``.
        grammar: Unused.

    Returns:
        None.
    """
    rule, (origin, dot), parent_var, k, prix, rix = state.state
    parent = state.node
    symbols = rule.split()
    cost_so_far = state.cost
    heuristic = 0

    if k >= len(string):
        return

    token = string[k]
    candidates = {
        token + suffix
        for suffix in ('er', 'ish', 'est', 'der', 'dish', '')
    }
    expected = symbols[dot]
    if expected not in candidates:
        return

    advanced = (rule, (origin, dot + 1), parent_var, k + 1, prix, rix)
    parent.children.append(node(expected, [], parent, start=k, end=k + 1))
    parent.end = max(k + 1, parent.end)
    # NOTE: the advanced state is only queued here; it is not written to
    # state_tracker by this function.
    queue.add_node(advanced, parent, (-k - 1, cost_so_far[1]), heuristic)
def chart_parse(string, grammar, terminals, nonterminals, first_variable):
    """Run a queue-driven chart parse of ``string`` and return the parse node.

    States are tuples ``(rule, (i, dot_ix), parent_var, k, prix, rix)`` where
    ``rule`` is a whitespace-separated string, ``dot_ix`` is the position of
    the dot inside it, ``i`` is the position the rule started at and ``k`` is
    the current input position. Costs are tuples ``(-k, value)``. States are
    popped from an ``AStarQ`` and dispatched to the nested ``predictor``,
    ``scanner`` or ``completer`` until an accepting state is popped.

    Uses ``AStarQ``, ``node`` and the ``copy`` module, none of which are
    defined inside this function.

    Args:
        string: Indexable sequence of input tokens.
        grammar: Mapping from a symbol to a sequence of ``(rule, prob)`` pairs.
        terminals: Not referenced inside this function.
        nonterminals: Container of symbols that are expanded by ``predictor``;
            any other dotted symbol is handed to ``scanner``.
        first_variable: Symbol whose first alternative seeds the parse.

    Returns:
        The node of the first accepting state popped from the queue, or
        ``-1`` (after printing a message) when ``queue.pop_node()`` raises
        ``KeyError``.
    """
    queue = None
    # One dict per input position (0..len(string)), keyed by (state, cost)
    # and holding the node built for that state.
    state_tracker = [dict() for k in range(len(string) + 1)]
    # NOTE(review): the two index dictionaries below are built but never read
    # anywhere in this function.
    var_ix_dict = {k : {rule[0] : (j, i) for (i, rule) in enumerate(grammar[k])} \
                   for (j, k) in enumerate(grammar)}
    ix_var_dict = {j : {i : (k, rule[0]) for (i, rule) in enumerate(grammar[k])} \
                   for (j, k) in enumerate(grammar)}

    def predictor(state, k, grammar):
        # Expand the nonterminal after the dot: one new state per alternative,
        # starting at k, with the parent's cost value multiplied by prob.
        rule, (i, dot_ix), parent_var, k, prix, rix = state.state
        parent_node = state.node
        rule = rule.split()
        cur_cost = state.cost
        hrst = 0
        for ri, (next_rule, prob) in enumerate(grammar[rule[dot_ix]]):
            next_state = (next_rule, (k, 0), rule[dot_ix], k, rix, ri)
            child_node = node(rule[dot_ix], [], parent_node, start=k)
            child_cost = (-k, cur_cost[1] * prob)
            state_tracker[k][(next_state, child_cost)] = child_node  #q_entry.node
            queue.add_node(next_state, child_node, child_cost, hrst)

    def scanner(state, k, grammar):
        # Match the dotted symbol against string[k] (optionally extended by a
        # fixed suffix) and, on success, advance the dot and the position.
        rule, (i, dot_ix), parent_var, k, prix, rix = state.state
        parent_node = state.node
        split_rule = rule.split()
        cur_cost = state.cost
        hrst = 0
        if k >= len(string):
            return
        variations = set(
            [string[k] + x for x in ['er', 'ish', 'est', 'der', 'dish', '']])
        if split_rule[dot_ix] in variations:
            next_state = (rule, (i, dot_ix + 1), parent_var, k + 1, prix, rix)
            # The state's node is extended in place (no copy is made here).
            parent_node.children.append(
                node(split_rule[dot_ix], [], parent_node, \
                    start=k, end=k+1 )
            )
            parent_node.end = max(k + 1, parent_node.end)
            # state_tracker[k + 1][(next_state, (-k-1, cur_cost[1]))] = parent_node
            queue.add_node(next_state, parent_node, (-k - 1, cur_cost[1]), hrst)

    def completer(state, k):
        # For a finished rule that started at position i, advance every state
        # recorded at i whose dotted symbol equals this rule's parent_var.
        rule, (i, dot_ix), parent_var, state_k, pprix, prix = state.state
        complete_node = state.node
        cost = state.cost
        hrst = 0
        for (entry, cur_cost) in state_tracker[i]:
            cur_rule, (cur_i, cur_dot), cur_parent, cur_k, _, rix = entry
            cur_node = state_tracker[i][(entry, cur_cost)]
            split_cur_rule = cur_rule.split()
            # Entries whose dot is already at the end have nothing to advance.
            if cur_dot >= len(split_cur_rule):
                continue
            if parent_var == split_cur_rule[cur_dot]:
                # Deep-copy so the node stored in state_tracker is not mutated
                # when the completed child is attached.
                cur_node = copy.deepcopy(state_tracker[i][(entry, cur_cost)])
                next_state = (cur_rule, (cur_i, cur_dot + 1), cur_parent, k, prix, rix)
                cur_node.children.append(complete_node)
                cur_node.end = complete_node.end
                # state_tracker[k][(next_state, (-k, cost[1]))] = cur_node
                queue.add_node(next_state, cur_node, (-k, cost[1]), hrst)

    # Seed the queue with the first alternative of first_variable at position 0.
    start_state, prob = grammar[first_variable][0]
    start_state = (start_state, (0, 0), first_variable, 0, -1, 0)
    queue = AStarQ()
    start_node = node(first_variable, [], None, start=0, end=-1)
    state_tracker[0][(start_state, (0, -prob))] = start_node
    queue.add_node(start_state, start_node, (0, -prob), 0)
    while True:
        try:
            q_entry = queue.pop_node()
        except KeyError as e:
            # pop_node raising KeyError is treated as "no parse found".
            print('cannot be parsed', e)
            return -1
        rule, (i, dot_ix), parent_var, k, _, rix = q_entry.state
        rule = rule.split()
        cur_cost = q_entry.cost
        # States queued by scanner/completer are not recorded when created,
        # so record them here when they are popped.
        if (q_entry.state, cur_cost) not in state_tracker[k]:
            state_tracker[k][(q_entry.state, cur_cost)] = q_entry.node
        # NOTE(review): acceptance is hard-coded to parent_var == '<P>' with
        # the dot at index 1 of a rule that started at 0, rather than being
        # derived from first_variable or the start rule's length -- confirm
        # this matches every grammar passed in.
        if (i, dot_ix) == (0, 1) and parent_var == '<P>' and k == len(string):
            return q_entry.node
        if not (dot_ix == len(rule)):
            if rule[dot_ix] in nonterminals:
                predictor(q_entry, k, grammar)
            else:
                scanner(q_entry, k, grammar)
        else:
            # Dot is at the end of the rule: the rule is complete.
            completer(q_entry, k)