def test_earley(grammar, tokens):
    """Debug helper: chart-parse `tokens` and dump every chart edge.

    Prints each Earley chart edge together with its end position, then the
    result of grammarutils.earley_predict for the same grammar and tokens.
    trace=1 makes nltk print the parsing trace as well.
    """
    earley_parser = nltk.EarleyChartParser(grammar, trace=1)
    e_chart = earley_parser.chart_parse(tokens)
    for edge in e_chart.edges():
        # Fix: Python-2 `print edge, edge.end()` is a SyntaxError on Python 3;
        # the rest of the file already uses the print() function.
        print(edge, edge.end())
    print(grammarutils.earley_predict(grammar, tokens))
def predict_next_symbols(grammar, tokens):
    """Predict the terminal symbols that could extend `tokens` under `grammar`.

    Args:
        grammar: an nltk (P)CFG accepted by EarleyChartParser.
        tokens: observed token sequence; deduplicated via remove_duplicate().

    Returns:
        (symbols, probs): parallel lists of candidate next terminals and their
        accumulated probabilities.  Both lists are empty when the tokens cannot
        be chart-parsed at all.
    """
    tokens = remove_duplicate(tokens)
    symbols = list()
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    try:
        e_chart = earley_parser.chart_parse(tokens)
    except ValueError:
        # Fix: return a (symbols, probs) pair like the success path, so callers
        # that unpack the result do not crash (previously returned a bare list).
        return list(), list()
    end_edges = list()
    for edge in e_chart.edges():
        if edge.end() == len(tokens):
            # Only keep edges whose next symbol is a terminal (a string).
            # Fix: `unicode` does not exist on Python 3; use `str`, consistent
            # with compute_sentence_probability's identical check.
            if isinstance(edge.nextsym(), str):
                symbols.append(edge.nextsym())
                end_edges.append(edge)
    probs = list()
    for end_edge in end_edges:
        probs.append(get_edge_prob(end_edge, e_chart, grammar))
    # Merge duplicate symbols by summing their probabilities.
    symbols_no_duplicate = list()
    probs_no_duplicate = list()
    for s, p in zip(symbols, probs):
        if s not in symbols_no_duplicate:
            symbols_no_duplicate.append(s)
            probs_no_duplicate.append(p)
        else:
            probs_no_duplicate[symbols_no_duplicate.index(s)] += p
    return symbols_no_duplicate, probs_no_duplicate
def compute_grammar_prefix_probability(grammar, tokens):
    """Return the total probability mass of sentences having `tokens` as prefix.

    Sums, over all chart edges that span the full observed prefix, the edge
    probabilities of (a) complete sentences and (b) edges whose next symbol is
    a terminal (i.e. valid prefixes).  Returns 0.0 when the tokens cannot be
    chart-parsed at all.

    NOTE(review): this calls get_edge_prob(edge, edge_idx, chart, grammar,
    level=0), a different arity than the 3-argument get_edge_prob used in
    compute_sentence_probability — presumably a different helper version;
    confirm which one is in scope.
    """
    # Fix: trace=0 — trace=1 was a debug leftover that spams stdout; every
    # other compute_* function in this file uses trace=0.
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    try:
        e_chart = earley_parser.chart_parse(tokens)
    except ValueError:
        return 0.0
    prob = 0.0
    for edge_idx, edge in enumerate(e_chart.edges()):
        if edge.end() != len(tokens) or not len(edge.rhs()):
            continue
        nextsym = edge.nextsym()
        # Fix: the original `if not nextsym` / `elif isinstance(nextsym, str)`
        # branches had byte-identical bodies; merged into one condition.
        # A falsy nextsym means the edge is a complete sentence; a string
        # nextsym means the prefix can be extended by a terminal.
        if not nextsym or isinstance(nextsym, str):
            prob += get_edge_prob(edge, edge_idx, e_chart, grammar, level=0)
    return prob
def compute_sentence_probability(grammar, tokens):
    """Return the probability of `tokens` under the PCFG `grammar`.

    If the tokens form one or more complete sentences, returns the average
    Viterbi parse likelihood.  Otherwise treats the tokens as a sentence
    prefix and returns the summed probability of all chart edges that span
    the prefix and can be extended by a terminal.  Returns 0.0 when the
    tokens cannot be chart-parsed at all.
    """
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    viterbi_parser = nltk.ViterbiParser(grammar)
    try:
        e_chart = earley_parser.chart_parse(tokens)
    except ValueError:
        # Fix: return 0.0 (not int 0) so the return type is consistently float;
        # also removed the unused local `invalid_prob` and the dead
        # commented-out find_closest_tokens fallback that referenced it.
        return 0.0
    # If the sentence is complete, return the average Viterbi likelihood.
    v_parses = viterbi_parser.parse_all(tokens)
    if v_parses:
        return functools.reduce(lambda a, b: a + b.prob(), v_parses, 0) / len(v_parses)
    # If the sentence is incomplete, sum the probabilities of all possible
    # continuations (edges ending at the prefix whose next symbol is terminal).
    prob = 0.0
    for edge in e_chart.edges():
        if edge.end() == len(tokens) and isinstance(edge.nextsym(), str):
            prob += get_edge_prob(edge, e_chart, grammar)
    return prob
def predict_next_symbols(grammar, tokens):
    """Predict next terminals for `tokens`, with probabilities, self-contained.

    Unlike the other predict_next_symbols variant in this file, this version
    defines its probability helpers as closures over the local chart.

    Returns:
        list of (symbol, probability) pairs, duplicates merged by summing.

    Raises:
        ValueError: propagated from chart_parse when a token is not covered
        by the grammar (no try/except here, unlike the other variant).
    """

    def get_production_prob(selected_edge):
        # Find the production rule matching the edge and return its probability.
        for production in grammar.productions(lhs=selected_edge.lhs()):
            if production.rhs() == selected_edge.rhs():
                return production.prob()
        # Fix: previously fell off the end and returned None, which made the
        # multiplication in get_edge_prob raise TypeError; 0.0 is the correct
        # probability for an edge with no matching production.
        return 0.0

    def find_parent(selected_edge):
        # Find the parent edges that lead to the selected edge.
        p_edges = list()
        for p_edge in e_chart.edges():
            if p_edge.end() == selected_edge.start() and p_edge.nextsym() == selected_edge.lhs():
                p_edges.append(p_edge)
        return p_edges

    def get_edge_prob(selected_edge):
        # Probability of an edge = its production probability times the summed
        # probability of every parent edge (computed recursively).
        prob = get_production_prob(selected_edge)
        if selected_edge.start() != 0:
            parent_prob = 0
            for parent_edge in find_parent(selected_edge):
                parent_prob += get_edge_prob(parent_edge)
            prob *= parent_prob
        return prob

    symbols = list()
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    e_chart = earley_parser.chart_parse(tokens)
    end_edges = list()
    for edge in e_chart.edges():
        if edge.end() == len(tokens):
            # Only keep edges whose next symbol is a terminal (a string).
            # Fix: `unicode` is a NameError on Python 3; use `str`.
            if isinstance(edge.nextsym(), str):
                symbols.append(edge.nextsym())
                end_edges.append(edge)
    probs = list()
    for end_edge in end_edges:
        probs.append(get_edge_prob(end_edge))
    # Merge duplicate symbols by summing their probabilities.
    symbols_no_duplicate = list()
    probs_no_duplicate = list()
    for s, p in zip(symbols, probs):
        if s not in symbols_no_duplicate:
            symbols_no_duplicate.append(s)
            probs_no_duplicate.append(p)
        else:
            probs_no_duplicate[symbols_no_duplicate.index(s)] += p
    # Fix: on Python 3 `zip(...)` is a single-use iterator; return a list so
    # callers can index, re-iterate, and take len() as before on Python 2.
    return list(zip(symbols_no_duplicate, probs_no_duplicate))
def count_valid():
    """Count how many of the module-level `words` parse under `raw_rules`.

    Builds a CFG from the module-level `raw_rules` lines and returns the number
    of entries in the module-level `words` iterable that have at least one
    parse.  Words containing tokens outside the grammar are counted as invalid.
    """
    grammar = nltk.CFG.fromstring('\n'.join(raw_rules))
    parser = nltk.EarleyChartParser(grammar)
    cnt = 0
    for word in words:
        try:
            # Fix: only need to know whether ONE parse exists — next() on the
            # iterator short-circuits instead of materializing every parse.
            if next(iter(parser.parse(word)), None) is not None:
                cnt += 1
        # Fix: narrowed the bare `except:` (which also swallowed SystemExit,
        # KeyboardInterrupt and real bugs) to ValueError, which nltk parsers
        # raise for tokens not covered by the grammar.
        except ValueError:
            pass
    return cnt
def test_grammar():
    """Smoke-test every CAD .pcfg grammar against its corpus file.

    For each grammar under <tmp_root>/grammar/cad, chart-parses every corpus
    line (mapped through the CAD sub-activity index) and prints the last chart
    edge of each parse.
    """
    paths = config.Paths()
    grammar_dir = os.path.join(paths.tmp_root, 'grammar', 'cad')
    for pcfg in os.listdir(grammar_dir):
        if not pcfg.endswith('.pcfg'):
            continue
        grammar = grammarutils.read_grammar(
            os.path.join(grammar_dir, pcfg),
            index=True,
            mapping=datasets.cad_metadata.subactivity_index)
        corpus_file = os.path.join(
            paths.tmp_root, 'corpus', 'cad', pcfg.replace('pcfg', 'txt'))
        with open(corpus_file, 'r') as f:
            for line in f:
                # Corpus lines are space-separated sub-activity names, possibly
                # decorated with trailing '*'/'#'; map each to its index string.
                names = line.strip(' *#\n').split(' ')
                tokens = [str(datasets.cad_metadata.subactivity_index[name])
                          for name in names]
                chart = nltk.EarleyChartParser(grammar, trace=0).chart_parse(tokens)
                print(chart.edges()[-1])
def get_prediciton_parse_tree(grammar, tokens, filename=None):
    """Sample a completion of `tokens`, parse it, and optionally render the tree.

    Args:
        grammar: nltk grammar used both to complete the sentence and to parse it.
        tokens: observed token prefix.
        filename: optional output path; the tree is drawn there, converted to a
            PNG via ImageMagick's `convert`, and the intermediate file removed.

    Returns:
        The first parse tree of the sampled complete sentence.
    """
    complete_tokens = sample_complete_sentence(grammar, tokens)
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    e_chart = earley_parser.chart_parse(complete_tokens)
    # Select the first parse tree.
    # Fix: `.next()` is Python-2 only; the builtin next() works on both.
    parse_tree = next(e_chart.parses(grammar.start()))
    # Save the parse tree as an image file if a filename is given
    if filename:
        cf = nltk.draw.util.CanvasFrame()
        tc = nltk.draw.TreeWidget(cf.canvas(), parse_tree)
        # Customize your own graph
        tc['node_font'] = 'arial 45 bold'
        tc['leaf_font'] = 'arial 45'
        tc['node_color'] = '#005990'
        tc['leaf_color'] = '#3F8F57'
        tc['line_color'] = '#175252'
        tc['line_width'] = '5'
        tc['xspace'] = 20
        tc['yspace'] = 20
        # Set color for the past observations.
        # Note that tc._leaves has more nodes than the leaves of the tree, and the last ones are displayed.
        for i in range(len(tc._leaves) - len(complete_tokens),
                       len(tc._leaves) - len(complete_tokens) + len(tokens) - 1):
            tc._leaves[i]['color'] = '#000000'
        # Set color for the current observation
        tc._leaves[len(tc._leaves) - len(complete_tokens) + len(tokens) - 1]['color'] = '#FF0000'
        cf.add_widget(tc, 10, 10)  # (10,10) offsets
        # cf.mainloop()
        cf.print_to_file(filename)
        cf.destroy()
        basename, ext = os.path.splitext(filename)
        os.system('convert {} {}'.format(filename, basename + '.png'))
        os.remove(filename)
    return parse_tree
def __init__(self, cache=None):
    """Initialize an AddressParser instance.

    Args:
        cache (dict): See attribute 'self._cache'.
    """
    self._parser = nltk.EarleyChartParser(_load_grammar(_GRAMMAR_PATH))
    # One alternation branch per token type, each as a named group.
    token_pattern = '|'.join('(?P<{}>{})'.format(*tt) for tt in _TOKEN_TYPES)
    self._token_regexp = re.compile(token_pattern, re.IGNORECASE)
    self._separation_regexp = re.compile(_SEPARATION_REGEXP, re.IGNORECASE)
    normalization_pattern = '|'.join(_NORMALIZATION_REGEXPS)
    self._normalization_regexp = re.compile(normalization_pattern, re.IGNORECASE)
    self._cache = cache
def parse_word_tag(word, tag, sentence):
    """Build a CFG from POS-tagged words and Earley-parse `sentence` with it.

    Args:
        word: list of word tokens, parallel to `tag`.
        tag: list of POS tags; each tag is mapped onto a coarse X-bar category
            (N, D, C, P, V, A, ADV, Q, Period, Sym) below.
        sentence: raw sentence string; split on whitespace before parsing.

    Side effects: prints the derived tag categories and the first parse tree
    (or 'NO TREE') to stdout.  Returns None.
    """
    # Several alternative X-bar-style grammar variants.  Only rule_perphrase_c
    # is actually used below; the rest are kept as references/backups.
    rule_perphrase_c = """S -> DP | PP | AP | VP | CP | ADVP
DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
Dprime -> D | NP | D NP | D CP
NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP | Nprime ADVP
Nprime -> N | N PP | PP N | N QP
PP -> Pprime | Pprime ADVP | Pprime VP
Pprime -> P | P DP
AP -> Aprime | Aprime ADVP | Aprime AP | Aprime CP
Aprime -> A | A DP
VP -> Vprime | Vprime ADVP | Vprime DP | Vprime CP
Vprime -> V | V DP | V PRN
CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime AP | Cprime QP | Cprime ADVP
Cprime -> C | C Cprime
QP -> Qprime | Qprime CP
Qprime -> Q | Q NP
ADVP -> ADVprime | ADVprime QP | ADVprime DP | ADVprime AP | ADVprime CP | ADVprime VP
ADVprime -> ADV | ADV ADVP""" + '\n'
    rule_perphrase_b = """S -> DP | PP | AP | VP | CP | ADV
DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
Dprime -> D | D NP | NP | D CP
NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
Nprime -> N | N PP | PP N
PP -> Pprime | Pprime ADV | Pprime VP
Pprime -> P | P DP
AP -> Aprime | Aprime ADV | Aprime AP | Aprime CP
Aprime -> A | A DP
VP -> Vprime | Vprime ADV| Vprime DP | Vprime CP
Vprime -> V | V DP | V PRN
CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP | Cprime ADV
Cprime -> C
QP -> Qprime | Qprime CP
Qprime -> Q""" + '\n'
    rule_perphrase_a = """S -> DP | PP | AP | VP | CP | ADV
DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
Dprime -> D NP | NP | D CP
NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
Nprime -> N | N PP | PP N
PP -> Pprime | Pprime ADV | Pprime VP
Pprime -> P | P DP
AP -> Aprime | Aprime ADV
Aprime -> A | A DP
VP -> Vprime | Vprime ADV | Vprime DP
Vprime -> V | V DP | V PRN | Vprime CP
CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP
Cprime -> C """ + '\n'
    rule_test_c = """S -> DP Period | VP Period
DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
Dprime -> D NP | NP | D CP
NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
Nprime -> N | N PP | PP N
PP -> Pprime | Pprime ADV | Pprime VP
Pprime -> P | P DP
AP -> Aprime | Aprime ADV
Aprime -> A | A DP
VP -> Vprime | Vprime ADV | Vprime DP
Vprime -> V | V DP | V PRN | Vprime CP
CP -> Cprime | Cprime VP | Cprime DP | Cprime NP
Cprime -> C """ + '\n'
    rule_test = """S -> DP Period | VP Period
DP -> Dprime | Dprime QP | Dprime AP
Dprime -> D NP | NP
NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP
Nprime -> N | N PP | PP N | N CP PP | PP CP N
PP -> Pprime | Pprime ADV | Pprime VP
Pprime -> P | P DP
AP -> Aprime | Aprime ADV
Aprime -> A | A DP
VP -> Vprime | Vprime ADV | Vprime DP
Vprime -> V | V DP | V PRN | Vprime CP
CP -> Cprime | Cprime VP
Cprime -> C | C VP | C NP """ + '\n'
    rule_test_b = """S -> DP VP
DP -> Dprime QP | Dprime AP
Dprime -> D NP
PP -> Pprime ADV | Pprime VP
Pprime -> P DP
AP -> Aprime ADV
Aprime -> A DP
VP -> Vprime ADV | Vprime DP
Vprime -> V DP | V PRN | V CP
NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
Nprime -> N PP | PP N
CP -> Cprime VP
Cprime -> C VP | C NP """ + '\n'
    rule_abc = """S -> DP Period
DP -> Dprime QP | Dprime AP
Dprime -> D NP
NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
Nprime -> N PP | PP N | N CP PP | PP CP N
PP -> Pprime ADV | Pprime VP
Pprime -> P DP
AP -> Aprime ADV
Aprime -> A DP
VP -> Vprime ADV | Vprime DP
Vprime -> V DP | V PRN | Vprime CP
CP -> Cprime VP
Cprime -> C VP | C NP """ + '\n'
    # NOTE(review): rule_test_b is assigned a second time here with the same
    # content as above — almost certainly an accidental duplicate; confirm
    # before removing.
    rule_test_b = """S -> DP VP
DP -> Dprime QP | Dprime AP
Dprime -> D NP
PP -> Pprime ADV | Pprime VP
Pprime -> P DP
AP -> Aprime ADV
Aprime -> A DP
VP -> Vprime ADV | Vprime DP
Vprime -> V DP | V PRN | V CP
NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
Nprime -> N PP | PP N
CP -> Cprime VP
Cprime -> C VP | C NP """ + '\n'
    rule = """S -> NP VP Sym | VP NP Sym | VP Comma NP | NP Comma VP
DP -> Dprime QP | Dprime AP
Dprime -> D NP
PP -> Pprime ADV | Pprime TP
Pprime -> P DP
AP -> Aprime ADV
Aprime -> A DP
VP -> Vprime ADV | Vprime DP
Vprime -> V DP | V PRN | Vprime CP | V comma DP | V comma PRN | comma Vprime CP
NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
Nprime -> N PP | PP N | N Comma PP | PP Comma N | N CP PP | PP CP N
TP -> Tprime DP | Tprime Q
Tprime -> Tum VP | Tin VP
Tprime -> Tma AP
Tprime -> Tna- PP
Tprime -> Tmay VP
Tprime -> Ttaga VP
CP -> Cprime TP
Cprime -> C TP | C NP | comma C TP | comma C NP""" + '\n'
    rule_backup = """S -> NP VP | VP NP
DP -> Dprime QP | Dprime AP
Dprime -> D NP
PP -> Pprime ADV | Pprime TP
Pprime -> P DP
AP -> Aprime ADV
Aprime -> A DP
VP -> Vprime ADV | Vprime DP
Vprime -> V DP | V PRN | Vprime CP
NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
Nprime -> N PP | PP N | N CP PP | PP CP N
TP -> Tprime DP | Tprime Q
Tprime -> Tum VP | Tin VP
Tprime -> Tma AP
Tprime -> Tna- PP
Tprime -> Tmay VP
Tprime -> Ttaga VP
CP -> Cprime TP
Cprime -> C TP | C NP """ + '\n'
    i_tag = 0
    tag_rule = []
    sentence_word_tag = ''
    #print('tag length: ', len(tag))
    while i_tag < len(tag):
        # Map fine-grained POS tags to coarse grammar categories.  Branch order
        # matters: "CCU" must be tested before "CC", and "PMP"/"PMC" before "PM".
        if "NN" in tag[i_tag]:
            tag_rule.append('N')
        elif "PR" in tag[i_tag]:
            tag_rule.append('N')
        elif "DT" in tag[i_tag]:
            tag_rule.append('D')
        elif "LM" in tag[i_tag]:
            tag_rule.append('C')
        elif "CCU" in tag[i_tag]:
            tag_rule.append('P')
        elif "CC" in tag[i_tag]:
            tag_rule.append('C')
        elif "VB" in tag[i_tag]:
            tag_rule.append('V')
        elif "JJ" in tag[i_tag]:
            tag_rule.append('A')
        elif "RB" in tag[i_tag]:
            tag_rule.append('ADV')
        elif "CD" in tag[i_tag]:
            tag_rule.append('Q')
        elif "TS" in tag[i_tag]:
            tag_rule.append('D')
        elif "FW" in tag[i_tag]:
            tag_rule.append('N')
        elif "PMP" in tag[i_tag]:
            tag_rule.append('Period')
        elif "PMC" in tag[i_tag]:
            tag_rule.append('C')
        elif "PM" in tag[i_tag]:
            tag_rule.append('Sym')
        # NOTE(review): if a tag matches none of the branches above nothing is
        # appended, and tag_rule[i_tag] below would raise IndexError — confirm
        # the tagset is closed over these substrings.
        i_word = 0
        word_repeated = False
        # Skip words already seen earlier so each lexical rule is emitted once.
        while i_word < i_tag:
            if word[i_tag] == word[i_word]:
                word_repeated = True
            i_word += 1
        #print('i_tag: ', i_tag)
        if not word_repeated:
            # Emit a lexical production of the form:  N -> 'word'
            sentence_word_tag += tag_rule[i_tag] + " -> " + "'" + word[i_tag] + "'" + '\n'
        i_tag += 1
    # DP = D' + QP | D' + AP
    # D' = D + NP
    #
    # PP = P' + ADV | P' + TP
    # P' = P + DP
    #
    # AP = A' + ADV
    # A' = A + DP
    #
    # VP = V' + ADV | V' + DP
    # V' = V + DP ¦ V + PRN ¦ V' + CP
    #
    # NP = N' + attribute phrase
    # N' = N + PP
    sentence_split = sentence.split()
    # Combine the phrase-structure rules with the generated lexical rules.
    grammar = CFG.fromstring(rule_perphrase_c + sentence_word_tag)
    # #test uncomment to test english structure
    # grammar = CFG.fromstring("""
    # S -> NP VP
    # PP -> P NP
    # NP -> 'the' N | N PP | 'the' N PP
    # VP -> V NP | V PP | V NP PP
    # N -> 'cat'
    # N -> 'dog'
    # N -> 'rug'
    # V -> 'chased'
    # V -> 'sat'
    # P -> 'in'
    # P -> 'on'""")
    # sentence_split = 'the cat chased the dog on the rug'.split()
    rd = RecursiveDescentParser(grammar)
    sr = ShiftReduceParser(grammar)
    chart_parser = nltk.ChartParser(grammar)
    earley_chart_parser = nltk.EarleyChartParser(grammar)
    # The Earley chart parser is the one actually used below; rd/sr and the
    # plain ChartParser are constructed but never consulted.
    chart_parser = earley_chart_parser
    print(tag_rule)
    parse_tree = []
    print('Parse')
    for tree in chart_parser.parse(sentence_split):
        parse_tree.append(tree)
    if len(parse_tree) > 0:
        print(parse_tree[0])
    else:
        print('NO TREE')
PP -> P NP PropN -> 'Bill' | 'Bob' | 'He' Det -> 'the' | 'a' | 'an' | 'An' | 'The' | 'A' | 'on'| 'some' N -> 'bear' | 'squirrel' | 'park' | 'block' | 'table' | 'river' | 'dog' | 'dogs'| 'pasta' | 'anchovies' | 'restaurant' | 'fork' Adj -> 'angry' | 'frightened' | 'furry' V -> 'chased' | 'saw' | 'eats' | 'eat' | 'chase' | 'Put' | 'have' P -> 'on' | 'in' | 'along' | 'with' """) ##sentence1 = "He eats pasta with a fork in the restaurant".split() ##parser1 = nltk.ChartParser(grammar) ##for tree1 in parser1.parse(sentence1): ## # print(tree1) ## print (tree1.draw()) sr = ShiftReduceParser(grammar) sentence1 = "He eats pasta with some anchovies in the restaurant" tokens = nltk.word_tokenize(sentence1) for x in sr.parse(tokens): print(x.draw()) print("-------------------------------------------------------------------") sentence1 = "He eats pasta with some anchovies in the restaurant".split() parser1 = nltk.EarleyChartParser(grammar, trace=2) for tree1 in parser1.parse(sentence1): print(tree1)
def example1(s1):
    """Parse sentence `s1` with the grammar in 635/syn01.cfg and draw each tree."""
    cfg = nltk.data.load('635/syn01.cfg')
    parser = nltk.EarleyChartParser(cfg)
    tokens = s1.split()
    for parse in parser.parse(tokens):
        parse.draw()