def main(args):
    sentence = args.sentence.lower()
    args.sentence = sentence
    tokens = sentence.split()
    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)
    # Replace the HS and ES rules with the estimated terminal production rules.
    HSrules = grammar.productions(Nonterminal('HS'))
    for rule in HSrules:
        grammar.productions().remove(rule)
    ESrules = grammar.productions(Nonterminal('ES'))
    for rule in ESrules:
        grammar.productions().remove(rule)
    grammar.productions().extend(terminalProductionRules)
    # Add a unit production TOKEN -> 'token' for every word in the sentence.
    for token in tokens:
        grammar.productions().append(
            ProbabilisticProduction(Nonterminal(token.upper()),
                                    [str(token)],  # Python 3: str is unicode
                                    prob=1))
    # Serialize the modified production list and re-parse it into a fresh PCFG.
    strgrammar = ''
    for p in grammar.productions():
        rhsstr = ''
        for r in p.rhs():
            if is_terminal(r):
                rhsstr += '\'' + str(r) + '\' '
            else:
                rhsstr += str(r) + ' '
        strgrammar += str(p.lhs()) + ' -> ' + rhsstr + ' [' + \
            '{0:.8f}'.format(p.prob()) + ']\n'
    grammar = PCFG.fromstring(strgrammar.split('\n'))
    CYK(tokens, nonterm, grammar)
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    print("\ninitializing...")
    global ALPHA
    global LPG_DIFF_THRESHOLD
    global MC_THRESHOLD
    global and_symb_count
    global or_symb_count
    global ignore_mc_ec
    ALPHA = alpha
    LPG_DIFF_THRESHOLD = gd_thr
    MC_THRESHOLD = mc_thr
    and_symb_count = 0
    or_symb_count = 0
    ignore_mc_ec = False
    ## create an empty grammar G
    S = Nonterminal("_START_")
    R = [ProbabilisticProduction(S, [""], prob=1.)]
    G = PCFG(S, R)
    T = _create_t(C)  # create a table T
    ## repeat until no further rule can be learned
    i = 0
    while not _finished(T):
        i += 1
        print("\niter. n° %d" % (i,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)
    G = _postprocessing(G, C)
    print("\n", G)  # DEBUG
    return G
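# Hedged usage sketch for pcfg_bcl(): the corpus is assumed to be the same
# '.'-separated sentence string that baseline()/langley_1() below return
# alongside a reference grammar; default thresholds are used.
G_ref, C = langley_1(depth=5, n=500)
G_learned = pcfg_bcl(C)
print(G_learned)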
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    parsed_sents -- list of training trees.
    start -- start symbol.
    horzMarkov -- None for default. A number n >= 0 for horizontal markovization.
    """
    self.start = start
    count_Y_Z = defaultdict(lambda: defaultdict(int))
    count_X = defaultdict(int)
    for t in parsed_sents:
        # Work on a copy: trees are mutable and we don't want to modify
        # the original.
        unle_trees = unlexicalize(t.copy(deep=True))
        # Chomsky normal form with horizontal markovization.
        unle_trees.chomsky_normal_form(horzMarkov=horzMarkov)
        # Collapse subtrees with a single child.
        unle_trees.collapse_unary(collapsePOS=True)
        for prod in unle_trees.productions():
            count_Y_Z[prod.lhs()][prod.rhs()] += 1
            count_X[prod.lhs()] += 1
    # Build the list of productions with maximum-likelihood probabilities.
    productions = []
    for X, c_X in count_X.items():
        for (Y_Z, c_Y_Z) in count_Y_Z[X].items():
            q = c_Y_Z / float(c_X)
            productions.append(ProbabilisticProduction(X, Y_Z, prob=q))
    self.production = productions
    grammar = PCFG(Nonterminal(start), productions)
    self.parser = CKYParser(grammar)
def random_sentences(grammar_string, n=None, depth=5):
    grammar = PCFG.fromstring(grammar_string)
    i = 0
    while True:
        if i == n:
            return
        tree = generate(grammar, depth=depth)
        yield ' '.join(utils.flatten(tree))
        i += 1
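# Minimal usage sketch for random_sentences(); assumes generate() returns a
# nested token tree that utils.flatten() can flatten, as the function expects.
# The toy grammar is illustrative.
toy_grammar = """
S -> 'a' S 'b' [0.5] | 'a' 'b' [0.5]
"""
for sent in random_sentences(toy_grammar, n=3, depth=6):
    print(sent)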
def test1(test_str):
    toy_pcfg = PCFG.fromstring("""
        S -> A A [0.8] | A B [.1] | A S [.1]
        A -> A A [.6] | A B [.2] | 'a' [.2]
        B -> B A [.3] | A B [.2] | 'b' [.5]
        """)
    for t in CYK(toy_pcfg, test_str.split(" "), 5):
        t.draw()
def loadPCFG(input_PCFG_filename):
    '''
    Loader function for a PCFG.

    args: input_PCFG_filename (str) -- path to a grammar file
    returns: nltk.grammar.PCFG object
    '''
    with open(input_PCFG_filename, 'r') as f:
        return PCFG.fromstring(f.read())
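# Usage sketch for loadPCFG(); 'grammar.txt' is a hypothetical file written
# in NLTK's PCFG notation, e.g. a line like: S -> NP VP [1.0]
pcfg = loadPCFG('grammar.txt')
print(pcfg.start())
for prod in pcfg.productions()[:5]:
    print(prod)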
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    parsed_sents -- list of training trees.
    """
    # { A -> B : count(A -> B) }
    productions_counts = defaultdict(int)
    # { A : count(A) }
    lhs_count = defaultdict(int)  # left_hand_side_count

    self.start = start  # start symbol for the CKY parser's grammar
    self.prods = []  # list of productions

    # Copy each tree, because unlexicalize() modifies it in place.
    # Original: unlexicalize_tree = [unlexicalize(t) for t in parsed_sents]
    unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

    for t in unlex_sents:
        t.chomsky_normal_form(horzMarkov=horzMarkov)
        t.collapse_unary(collapsePOS=True, collapseRoot=True)
        for prod in t.productions():
            # type(prod): <class 'nltk.grammar.Production'>
            # type(prod.lhs()): <class 'nltk.grammar.Nonterminal'>
            # type(prod.rhs()): <class 'tuple'>
            # Each element of prod.rhs() is an nltk.grammar.Nonterminal.
            productions_counts[prod] += 1
            lhs_count[prod.lhs()] += 1

    for prod, count_prod in productions_counts.items():
        # prod: A -> B
        # count_prod: count(A -> B)
        count_lhs = lhs_count.get(prod.lhs(), 0)
        q_ML = float(count_prod) / count_lhs
        self.prods += [ProbabilisticProduction(prod.lhs(), prod.rhs(),
                                               prob=q_ML)]
        # Each element of self.prods is an nltk.grammar.ProbabilisticProduction.

    # PCFG(start, productions)
    # type(start): Nonterminal
    # type(productions): list(Production)
    grammar = PCFG(Nonterminal(start), self.prods)
    self.my_parser = CKYParser(grammar)
def fill_missing_words(grammar: PCFG, missing_words: Set[str]):
    # UNK -> word1 | word2 | ... | wordN
    unknown = Nonterminal('UNK')
    unk_rules = [Production(unknown, [missing_word])
                 for missing_word in missing_words]
    # Add UNK as a possibility to all rules with strings in the right-hand side.
    corrected_rules: List[Nonterminal] = []
    rule: ProbabilisticProduction
    for rule in grammar.productions():
        # right-hand side has a string somewhere
        if any(isinstance(element, str) for element in rule.rhs()):
            # rule has already been corrected
            if rule.lhs() in corrected_rules:
                continue
            unk_rules.append(Production(rule.lhs(), [unknown]))
            corrected_rules.append(rule.lhs())
    return induce_pcfg(grammar.start(), grammar.productions() + unk_rules)
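# Illustrative check of fill_missing_words(): 'lobster' is unseen, so it gets
# routed through the new UNK nonterminal; induce_pcfg re-estimates the
# probabilities from the production counts. The toy grammar is made up.
g = PCFG.fromstring("""
    S -> Det N [1.0]
    Det -> 'the' [1.0]
    N -> 'cat' [0.5] | 'dog' [0.5]
""")
g2 = fill_missing_words(g, {'lobster'})
print(g2.productions())  # now includes UNK -> 'lobster', N -> UNK, Det -> UNK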
def _learning_by_biclustering(G, C, T):
    print("learning...")
    global biclusters
    global ignore_mc_ec
    ## find the valid bicluster BC in T that leads to the maximal posterior gain (Eq.2)
    BC = None
    ## first attempt
    attempts = 3
    while BC is None and attempts > 0:
        attempts -= 1
        BC = _get_best_bicluster(T, C)
    if BC is None:
        ignore_mc_ec = True
        ## second attempt
        attempts = 2
        while BC is None and attempts > 0:
            attempts -= 1
            BC = _get_best_bicluster(T, C)
        if BC is None:
            return False, G, C, T, None
    ignore_mc_ec = False
    ## create an AND symbol N and two OR symbols A, B
    N = Nonterminal("_AND_" + str(_get_and_symb_index()))
    A = Nonterminal("_OR_" + str(_get_or_symb_index()))
    B = Nonterminal("_OR_" + str(_get_or_symb_index()))
    bc = BC.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
    s = np.sum(bc)
    row_prob = np.sum(bc, 1) / s
    col_prob = np.sum(bc, 0) / s
    ## rule creation
    rules = []
    rules += [ProbabilisticProduction(A, [_format_nt(BC.index[i])],
                                      prob=row_prob[i])
              for i in range(BC.shape[0])]
    rules += [ProbabilisticProduction(B, [_format_nt(BC.columns[j])],
                                      prob=col_prob[j])
              for j in range(BC.shape[1])]
    rules += [ProbabilisticProduction(N, [A, B], prob=1.)]
    ## updates
    G_updated = PCFG(G.start(), G.productions() + rules)  # add the rules to G
    C_reduced = _reduce_corpus(C, BC, N)  # reduce the corpus
    T_updated = _create_t(C_reduced)  # update T
    biclusters[(N.symbol(), A.symbol(), B.symbol())] = BC  # save BC for the learned group
    return True, G_updated, C_reduced, T_updated, N
def langley_2(depth=5, n=500):
    G = PCFG.fromstring("""
        S -> NP VP [1.0]
        VP -> V NP [1.0]
        NP -> Det N [0.5] | Det N RC [0.5]
        RC -> Rel VP [1.0]
        Det -> 'the' [0.5] | 'a' [0.5]
        V -> 'saw' [0.5] | 'heard' [0.5]
        N -> 'cat' [0.3333] | 'dog' [0.3333] | 'mouse' [0.3333]
        Rel -> 'that' [1.0]
        """)
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
def baseline(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    PP = Nonterminal("PP")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vc = Nonterminal("Vc")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    P = Nonterminal("P")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(NP, [Det, N], prob=1.),
        ProbabilisticProduction(VP, [Vt, NP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vc, PP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vi], prob=1 / 3),
        ProbabilisticProduction(PP, [P, NP], prob=1.),
        ProbabilisticProduction(Det, ["a"], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=.5),
        ProbabilisticProduction(Vt, ["touches"], prob=.5),
        ProbabilisticProduction(Vt, ["covers"], prob=.5),
        ProbabilisticProduction(Vi, ["rolls"], prob=.5),
        ProbabilisticProduction(Vi, ["bounces"], prob=.5),
        ProbabilisticProduction(Vc, ["is"], prob=1.),
        ProbabilisticProduction(N, ["circle"], prob=1 / 3),
        ProbabilisticProduction(N, ["square"], prob=1 / 3),
        ProbabilisticProduction(N, ["triangle"], prob=1 / 3),
        ProbabilisticProduction(P, ["above"], prob=.5),
        ProbabilisticProduction(P, ["below"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
def langley_1(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    AP = Nonterminal("AP")
    Adj = Nonterminal("Adj")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(VP, [Vi], prob=.5),
        ProbabilisticProduction(VP, [Vt, NP], prob=.5),
        ProbabilisticProduction(NP, [Det, N], prob=.5),
        ProbabilisticProduction(NP, [Det, AP, N], prob=.5),
        ProbabilisticProduction(AP, [Adj], prob=.5),
        ProbabilisticProduction(AP, [Adj, AP], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=1.),
        ProbabilisticProduction(Vt, ["saw"], prob=.5),
        ProbabilisticProduction(Vt, ["heard"], prob=.5),
        ProbabilisticProduction(Vi, ["ate"], prob=.5),
        ProbabilisticProduction(Vi, ["slept"], prob=.5),
        ProbabilisticProduction(N, ["cat"], prob=.5),
        ProbabilisticProduction(N, ["dog"], prob=.5),
        ProbabilisticProduction(Adj, ["big"], prob=.5),
        ProbabilisticProduction(Adj, ["old"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
def _postprocessing(G, C):
    print("\npostprocessing...")
    ## drop the _START_ -> ... rule
    rules = []
    for prod in G.productions():
        if G.start().symbol() not in prod.lhs().symbol():
            rules.append(prod)
    if len(rules) == 0:
        return G
    ## create an OR symbol S
    S = Nonterminal("_START_")
    sss = {}  # single-symbol sentences
    ## for each sentence s in C do
    ##     if s is fully reduced to a single symbol x then
    ##         add S -> x to G, or if the rule already exists, increase its weight by 1
    for sentence in sent_tokenize(C):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        t = word_tokenize(sentence)
        if len(t) == 1:
            sss[t[0]] = 1 if t[0] not in sss else sss[t[0]] + 1
    weight_sum = sum([sss[k] for k in sss])
    rules += [ProbabilisticProduction(S, [_format_nt(k)], prob=sss[k] / weight_sum)
              for k in sss]
    return PCFG(S, rules)
def test_parse_ambiguity(self):
    # Example taken from pages 4, 5 and 8 of Michael Collins' notes
    # "Probabilistic Context-Free Grammars (PCFGs)".
    grammar = PCFG.fromstring("""
        S -> NP VP [1.0]
        VP -> Vt NP [0.65]
        VP -> VP PP [0.35]
        NP -> DT NN [0.8]
        NP -> NP PP [0.2]
        PP -> IN NP [1.0]
        Vt -> 'saw' [1.0]
        NN -> 'man' [0.2]
        NN -> 'telescope' [0.3]
        NN -> 'dog' [0.5]
        DT -> 'the' [1.0]
        IN -> 'with' [1.0]
    """)
    # Changing this to:
    #     VP -> Vt NP [0.85]
    #     VP -> VP PP [0.15]
    # yields the other tree.
    parser = CKYParser(grammar)
    lp, t = parser.parse('the man saw the dog with the telescope'.split())
    # draw_trees(t)
    # check tree
    t2 = Tree.fromstring("""
        (S
            (NP (DT the) (NN man))
            (VP
                (VP (Vt saw) (NP (DT the) (NN dog)))
                (PP (IN with) (NP (DT the) (NN telescope)))
            )
        )
    """)
    self.assertEqual(t, t2)
    # check log probability
    lp2 = log2(1.0 * 0.8 * 1.0 * 0.2 *
               0.35 * 0.65 * 1.0 * 0.8 * 1.0 * 0.5 *
               1.0 * 1.0 * 0.8 * 1.0 * 0.3)
    self.assertAlmostEqual(lp, lp2)
import nltk
from nltk.corpus import treebank
from itertools import islice
from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2

gram2 = PCFG.fromstring("""
    A -> B B [.3] | C B C [.7]
    B -> B D [.5] | C [.5]
    C -> 'a' [.1] | 'b' [0.9]
    D -> 'b' [1.0]
""")

prod1 = gram2.productions()[0]
print(prod1)
prod2 = gram2.productions()[1]
print(prod2)
print(prod2.lhs())
print(prod2.rhs())
print(prod2.prob())
print(gram2.start())
print(gram2.productions())
beta_20 = round(beta / 20, 4)  # rounded to four decimal places
print("beta_20: " + str(beta_20))
# beta_20 and compl_beta_20 must always sum to 0.05
compl_beta_20 = 0.05 - beta_20
print("compl_beta_20: " + str(compl_beta_20))

grammar = """
S -> A V1 B [0.5] | C W1 D [0.45] | C W2 D [""" + str(beta_20) + """] | A W2 B [""" + str(compl_beta_20) + """]
A -> 'a0' [0.1] | 'a1' [0.1] | 'a2' [0.1] | 'a3' [0.1] | 'a4' [0.1] | 'a5' [0.1] | 'a6' [0.1] | 'a7' [0.1] | 'a8' [0.1] | 'a9' [0.1]
B -> 'b0' [0.1] | 'b1' [0.1] | 'b2' [0.1] | 'b3' [0.1] | 'b4' [0.1] | 'b5' [0.1] | 'b6' [0.1] | 'b7' [0.1] | 'b8' [0.1] | 'b9' [0.1]
C -> 'c0' [0.1] | 'c1' [0.1] | 'c2' [0.1] | 'c3' [0.1] | 'c4' [0.1] | 'c5' [0.1] | 'c6' [0.1] | 'c7' [0.1] | 'c8' [0.1] | 'c9' [0.1]
D -> 'd0' [0.1] | 'd1' [0.1] | 'd2' [0.1] | 'd3' [0.1] | 'd4' [0.1] | 'd5' [0.1] | 'd6' [0.1] | 'd7' [0.1] | 'd8' [0.1] | 'd9' [0.1]
V1 -> 'v0' [0.02] | 'v1' [0.02] | 'v2' [0.02] | 'v3' [0.02] | 'v4' [0.02] | 'v5' [0.02] | 'v6' [0.02] | 'v7' [0.02] | 'v8' [0.02] | 'v9' [0.02] | 'v10' [0.02] | 'v11' [0.02] | 'v12' [0.02] | 'v13' [0.02] | 'v14' [0.02] | 'v15' [0.02] | 'v16' [0.02] | 'v17' [0.02] | 'v18' [0.02] | 'v19' [0.02] | 'v20' [0.02] | 'v21' [0.02] | 'v22' [0.02] | 'v23' [0.02] | 'v24' [0.02] | 'v25' [0.02] | 'v26' [0.02] | 'v27' [0.02] | 'v28' [0.02] | 'v29' [0.02] | 'v30' [0.02] | 'v31' [0.02] | 'v32' [0.02] | 'v33' [0.02] | 'v34' [0.02] | 'v35' [0.02] | 'v36' [0.02] | 'v37' [0.02] | 'v38' [0.02] | 'v39' [0.02] | 'v40' [0.02] | 'v41' [0.02] | 'v42' [0.02] | 'v43' [0.02] | 'v44' [0.02] | 'v45' [0.02] | 'v46' [0.02] | 'v47' [0.02] | 'v48' [0.02] | 'v49' [0.02]
W1 -> 'w5' [0.0232] | 'w6' [0.0222] | 'w7' [0.0222] | 'w8' [0.0222] | 'w9' [0.0222] | 'w10' [0.0222] | 'w11' [0.0222] | 'w12' [0.0222] | 'w13' [0.0222] | 'w14' [0.0222] | 'w15' [0.0222] | 'w16' [0.0222] | 'w17' [0.0222] | 'w18' [0.0222] | 'w19' [0.0222] | 'w20' [0.0222] | 'w21' [0.0222] | 'w22' [0.0222] | 'w23' [0.0222] | 'w24' [0.0222] | 'w25' [0.0222] | 'w26' [0.0222] | 'w27' [0.0222] | 'w28' [0.0222] | 'w29' [0.0222] | 'w30' [0.0222] | 'w31' [0.0222] | 'w32' [0.0222] | 'w33' [0.0222] | 'w34' [0.0222] | 'w35' [0.0222] | 'w36' [0.0222] | 'w37' [0.0222] | 'w38' [0.0222] | 'w39' [0.0222] | 'w40' [0.0222] | 'w41' [0.0222] | 'w42' [0.0222] | 'w43' [0.0222] | 'w44' [0.0222] | 'w45' [0.0222] | 'w46' [0.0222] | 'w47' [0.0222] | 'w48' [0.0222] | 'w49' [0.0222]
W2 -> 'w0' [0.2] | 'w1' [0.2] | 'w2' [0.2] | 'w3' [0.2] | 'w4' [0.2]
"""
print(grammar)

ambiguity = PCFG.fromstring(grammar)
print(ambiguity)

archivo_destino = 'ambiguity_alfa_' + str(alfa) + '.txt'
with open(archivo_destino, 'w') as f:
    for sentence in pcfg_generate(ambiguity, n=100000, depth=100):
        f.write(' '.join(sentence) + '\n')
from nltk.grammar import Nonterminal, PCFG
from pcfg_generate import *

grammar = """
S -> 'a' V 'b' [0.25] | 'b' V 'a' [0.25] | 'a' W 'a' [0.125] | 'a' W 'b' [0.125] | 'b' W 'a' [0.125] | 'b' W 'b' [0.125]
V -> 'v0' [0.2] | 'v1' [0.2] | 'v2' [0.2] | 'v3' [0.2] | 'v4' [0.2]
W -> 'w0' [0.2] | 'w1' [0.2] | 'w2' [0.2] | 'w3' [0.2] | 'w4' [0.2]
"""
print(grammar)

nonconflation = PCFG.fromstring(grammar)
print(nonconflation)

"""
texto = ""
for sentence in pcfg_generate(nonconflation, n=100000, depth=6):
    #print(' '.join(sentence))
    #print(sentence)
    str1 = ' '.join(sentence)
    texto = texto + str1 + '\n'
"""

with open('nonconflation.txt', 'w') as f:
    for sentence in pcfg_generate(nonconflation, n=100000, depth=100):
        f.write(' '.join(sentence) + '\n')
def test_parse(self):
    grammar = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> Det Noun [0.6]
        NP -> Noun Adj [0.4]
        VP -> Verb NP [1.0]
        Det -> 'el' [1.0]
        Noun -> 'gato' [0.9]
        Noun -> 'pescado' [0.1]
        Verb -> 'come' [1.0]
        Adj -> 'crudo' [1.0]
        """)
    parser = CKYParser(grammar)
    lp, t = parser.parse('el gato come pescado crudo'.split())
    # check chart
    pi = {
        (1, 1): {'Det': log2(1.0)},
        (2, 2): {'Noun': log2(0.9)},
        (3, 3): {'Verb': log2(1.0)},
        (4, 4): {'Noun': log2(0.1)},
        (5, 5): {'Adj': log2(1.0)},
        (1, 2): {'NP': log2(0.6 * 1.0 * 0.9)},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': log2(0.4 * 0.1 * 1.0)},
        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},
        (1, 4): {},
        (2, 5): {},
        (1, 5): {'S':
                 log2(1.0) +  # rule S -> NP VP
                 log2(0.6 * 1.0 * 0.9) +  # left part
                 log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},  # right part
    }
    self.assertEqualPi(parser._pi, pi)
    # check partial results
    bp = {
        (1, 1): {'Det': Tree.fromstring("(Det el)")},
        (2, 2): {'Noun': Tree.fromstring("(Noun gato)")},
        (3, 3): {'Verb': Tree.fromstring("(Verb come)")},
        (4, 4): {'Noun': Tree.fromstring("(Noun pescado)")},
        (5, 5): {'Adj': Tree.fromstring("(Adj crudo)")},
        (1, 2): {'NP': Tree.fromstring("(NP (Det el) (Noun gato))")},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")},
        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': Tree.fromstring(
            "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")},
        (1, 4): {},
        (2, 5): {},
        (1, 5): {'S': Tree.fromstring(
            """(S
                (NP (Det el) (Noun gato))
                (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
            )
            """)},
    }
    self.assertEqual(parser._bp, bp)
    # check tree
    t2 = Tree.fromstring(
        """
        (S
            (NP (Det el) (Noun gato))
            (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
        )
        """)
    self.assertEqual(t, t2)
    # check log probability
    lp2 = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
    self.assertAlmostEqual(lp, lp2)
def extract_simple_pcfg(n):
    rules = extract_simple_productions(n)
    pcfg = grammar.induce_pcfg(Nonterminal("S"), rules)
    return PCFG(pcfg.start(), sort_rules(pcfg.productions()))
def _attaching(N, G, C, T):
    print("attaching...")
    C_derived = _apply_grammar(G, C)
    ORs = []  # list of OR nonterminals
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)
    ## for each OR symbol O in G do
    ##     if O leads to a valid expanded bicluster
    ##     as well as a posterior gain (Eq.3) larger than a threshold then
    for O in ORs:
        #
        # AND-OR group
        group = None
        pos = None  # left or right (odd -> False, even -> True)
        ## fetch the AND-OR group of O
        for g in biclusters:
            if O.symbol() in g[1] or O.symbol() in g[2]:
                group = g
                break
        ## position of O within the group
        num = int(O.symbol()[4:])  # OR number, e.g. "_OR_2" -> 2
        pos = True if num % 2 == 0 else False
        #
        # BC_tilde and BC_tilde_prime
        ## build BC_t (BC_tilde)
        BC_t = biclusters[group].copy()
        ## fill BC_t
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)
        ## build BC_t_1 (BC_tilde_prime) (proposed new rule OR -> AND)
        BC_t_1 = BC_t.copy()
        ## fill BC_t_1
        if pos == False:
            ## new row (OR on the left)
            new_row = [_count_occ(" ".join((N.symbol(), x)), C)
                       for x in BC_t.columns]
            BC_t_1.loc[N.symbol(), :] = new_row
            BC_t_1 = BC_t_1.astype(int)
        else:
            ## new column (OR on the right)
            new_col = [_count_occ(" ".join((x, N.symbol())), C)
                       for x in BC_t.index]
            BC_t_1.loc[:, N.symbol()] = new_col
            BC_t_1 = BC_t_1.astype(int)
        #
        # EC_tilde and EC_tilde_prime
        ## build and fill EC_t
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        ## build EC_t_1
        EC_t_1 = EC_t.copy()
        ## add the new rows of EC_t_1
        if pos == False:
            ## OR on the left
            new_row_indices = [(N.symbol(), col) for col in BC_t_1.columns]
        else:
            ## OR on the right
            new_row_indices = [(row, N.symbol()) for row in BC_t_1.index]
        ## fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            EC_t_1.loc[i_str, :] = [-1] * EC_t_1.shape[1]
            for j in EC_t_1.columns:
                e, c = " ".join(i), list(_ec_index_to_tuple(j, False))  # expression, context
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str, j] = _count_occ(" ".join([c[0], e, c[1]]).strip(), C)
        EC_t_1 = EC_t_1.astype(int)
        # DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
        bc_t_1 = BC_t_1.to_numpy()
        ec_t_1 = EC_t_1.to_numpy()
        bc_t = BC_t.to_numpy()
        ec_t = EC_t.to_numpy()
        #
        # LOG POSTERIOR GAIN DIFFERENCE (Eq.3)
        ## skip unless all four matrices are valid (MC); the original had
        ## `not _is_mc(bc_t_1) and ...`, which misgroups the negation
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1) and _is_mc(bc_t) and _is_mc(ec_t)):
            continue
        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)
        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (O.symbol(), N.symbol()))
            bc = BC_t_1.to_numpy()
            s = np.sum(bc)
            row_prob = np.sum(bc, 1) / s
            col_prob = np.sum(bc, 0) / s
            ## keep every rule that does not rewrite O
            rules = []
            for prod in G.productions():
                if O.symbol() not in prod.lhs().symbol():
                    rules.append(prod)
            ## add the new rules
            if pos == False:
                ## OR on the left
                probs = row_prob
                rhs_symbols = [x for x in BC_t.index] + [N]
                for i in range(BC_t_1.shape[0]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[i]],
                                                         prob=probs[i]))
            else:
                ## OR on the right
                probs = col_prob
                rhs_symbols = [x for x in BC_t.columns] + [N]
                for j in range(BC_t_1.shape[1]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[j]],
                                                         prob=probs[j]))
            ## updates
            biclusters[group] = BC_t_1.copy()  # update the AND-OR group
            G = PCFG(G.start(), rules)  # update G
            C = _reduce_corpus(C, biclusters[group], N, True)  # reduce C
            T = _create_t(C)  # update T
    return G, C, T
def loadGrammar(args):
    with open(args.grammar_file, 'r') as f:
        pcfg = PCFG.fromstring(f.read())
    return pcfg
if len(sys.argv) != 6:
    print(
        'Usage: %s grammar_file csv_file skiprows nrows (-1 for all) top_K'
        % sys.argv[0])
    print(
        'Example: %s grammar/airbnb_grammar.txt ../data/Airbnb/SanFrancisco_details.csv 0 3 20'
        % sys.argv[0])
    exit(0)

grammar_txt = open(sys.argv[1]).read()
csv_file = sys.argv[2]
skiprows = int(sys.argv[3])
nrows = int(sys.argv[4])
if nrows == -1:
    nrows = None
top_K = int(sys.argv[5])

# print('Grammar:\n' + grammar_txt + '\n')
grammar = PCFG.fromstring(grammar_txt)

# Read CSV file into a Pandas DataFrame.
# Handle DtypeWarning: Columns (43) have mixed types. [One entry with zipcode '94107-1273']
# Fix dollar sign and thousands separators in 'price'.
df = pd.read_csv(csv_file, skiprows=skiprows, nrows=nrows,
                 dtype={
                     'host_id': str,  # np.str was removed in NumPy 1.24
                     'zipcode': str
                 },
                 converters={
                     'price': lambda s: float(s.replace('$', '').replace(',', ''))
                 })
grammar = PCFG.fromstring("""
    S -> negside eqside [0.5]
    S -> side eqside [0.5]
    digit -> '0' [0.1]
    digit -> '1' [0.1]
    digit -> '2' [0.1]
    digit -> '3' [0.1]
    digit -> '4' [0.1]
    digit -> '5' [0.1]
    digit -> '6' [0.1]
    digit -> '7' [0.1]
    digit -> '8' [0.1]
    digit -> '9' [0.1]
    div -> '/' [1.0]
    divnum -> div number [1.0]
    divnumvar -> divnum variable [1.0]
    dot -> '.' [1.0]
    eq -> '=' [1.0]
    eqside -> eq negside [0.5]
    eqside -> eq side [0.5]
    lparen -> '(' [1.0]
    minus -> '-' [1.0]
    minusterm -> minus term [1.0]
    negside -> minus side [1.0]
    number -> '0' [.05]
    number -> '1' [.05]
    number -> '2' [.05]
    number -> '3' [.05]
    number -> '4' [.05]
    number -> '5' [.05]
    number -> '6' [.05]
    number -> '7' [.05]
    number -> '8' [.05]
    number -> '9' [.05]
    number -> digit number [.05]
    number -> dot number [0.45]
    parenside -> lparen siderparen [1.0]
    plus -> '+' [1.0]
    plusterm -> plus term [1.0]
    rparen -> ')' [1.0]
    side -> '0' [.04]
    side -> '1' [.04]
    side -> '2' [.04]
    side -> '3' [.04]
    side -> '4' [.04]
    side -> '5' [.04]
    side -> '6' [.04]
    side -> '7' [.04]
    side -> '8' [.04]
    side -> '9' [.04]
    side -> 'x' [.04]
    side -> digit number [.04]
    side -> dot number [.04]
    side -> number divnum [.04]
    side -> number divnumvar [.04]
    side -> number parenside [.04]
    side -> number starnum [.04]
    side -> number vardivnum [.04]
    side -> number variable [.04]
    side -> parenside divnum [.04]
    side -> side minusterm [.04]
    side -> side plusterm [.16]
    siderparen -> negside rparen [0.5]
    siderparen -> side rparen [0.5]
    star -> '*' [1.0]
    starnum -> star number [1.0]
    term -> '0' [.05]
    term -> '1' [.05]
    term -> '2' [.05]
    term -> '3' [.05]
    term -> '4' [.05]
    term -> '5' [.05]
    term -> '6' [.05]
    term -> '7' [.05]
    term -> '8' [.05]
    term -> '9' [.05]
    term -> 'x' [.05]
    term -> digit number [.05]
    term -> dot number [.05]
    term -> number divnum [.05]
    term -> number divnumvar [.05]
    term -> number parenside [.05]
    term -> number starnum [.05]
    term -> number vardivnum [.05]
    term -> number variable [.05]
    term -> parenside divnum [.05]
    vardivnum -> variable divnum [1.0]
    variable -> 'x' [1.0]
    varstarnum -> variable starnum [1.0]
""")
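# Quick sanity check for the equation grammar above (illustrative): equations
# are sequences of single-character tokens, so list() works as the tokenizer.
from nltk.parse import ViterbiParser

eq_parser = ViterbiParser(grammar)
for tree in eq_parser.parse(list('x+2=5')):
    print(tree)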
def test_parse_2(self):
    grammar = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> NP PP [0.5]
        NP -> Det Noun [0.5]
        VP -> VP PP [0.9]
        VP -> Verb NP [0.1]
        PP -> Prep NP [1.0]
        Noun -> 'dog' [0.2]
        Noun -> 'man' [0.2]
        Noun -> 'town' [0.6]
        Verb -> 'saw' [1.0]
        Prep -> 'in' [1.0]
        Det -> 'the' [1.0]
        """)
    parser = CKYParser(grammar)
    lp, t = parser.parse('the man saw the dog in the town'.split())
    # check chart
    pi = {
        (1, 1): {'Det': log2(1.0)},
        (2, 2): {'Noun': log2(0.2)},
        (3, 3): {'Verb': log2(1.0)},
        (4, 4): {'Det': log2(1.0)},
        (5, 5): {'Noun': log2(0.2)},
        (6, 6): {'Prep': log2(1.0)},
        (7, 7): {'Det': log2(1.0)},
        (8, 8): {'Noun': log2(0.6)},
        (1, 2): {'NP': -3.321928094887362},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': -3.321928094887362},
        (5, 6): {},
        (6, 7): {},
        (7, 8): {'NP': -1.736965594166206},
        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': -6.643856189774724},
        (4, 6): {},
        (5, 7): {},
        (6, 8): {'PP': -1.736965594166206},
        (1, 4): {},
        (2, 5): {},
        (3, 6): {},
        (4, 7): {},
        (5, 8): {},
        (1, 5): {'S': -9.965784284662087},
        (2, 6): {},
        (3, 7): {},
        (4, 8): {'NP': -6.058893689053567},
        (1, 6): {},
        (2, 7): {},
        (3, 8): {'VP': -8.53282487738598},
        (1, 7): {},
        (2, 8): {},
        (1, 8): {'S': -11.854752972273342},
    }
    self.assertEqualPi(parser._pi, pi)
    bp = {
        (1, 1): {'Det': Tree.fromstring('(Det the)')},
        (2, 2): {'Noun': Tree.fromstring('(Noun man)')},
        (3, 3): {'Verb': Tree.fromstring('(Verb saw)')},
        (4, 4): {'Det': Tree.fromstring('(Det the)')},
        (5, 5): {'Noun': Tree.fromstring('(Noun dog)')},
        (6, 6): {'Prep': Tree.fromstring('(Prep in)')},
        (7, 7): {'Det': Tree.fromstring('(Det the)')},
        (8, 8): {'Noun': Tree.fromstring('(Noun town)')},
        (1, 2): {'NP': Tree.fromstring('(NP (Det the) (Noun man))')},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')},
        (5, 6): {},
        (6, 7): {},
        (7, 8): {'NP': Tree.fromstring('(NP (Det the) (Noun town))')},
        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': Tree.fromstring(
            '(VP (Verb saw) (NP (Det the) (Noun dog)))')},
        (4, 6): {},
        (5, 7): {},
        (6, 8): {'PP': Tree.fromstring(
            '(PP (Prep in) (NP (Det the) (Noun town)))')},
        (1, 4): {},
        (2, 5): {},
        (3, 6): {},
        (4, 7): {},
        (5, 8): {},
        (1, 5): {'S': Tree.fromstring(
            """(S
                (NP (Det the) (Noun man))
                (VP (Verb saw) (NP (Det the) (Noun dog))))""")},
        (2, 6): {},
        (3, 7): {},
        (4, 8): {'NP': Tree.fromstring(
            """(NP
                (NP (Det the) (Noun dog))
                (PP (Prep in) (NP (Det the) (Noun town))))""")},
        (1, 6): {},
        (2, 7): {},
        (3, 8): {'VP': Tree.fromstring(
            """(VP
                (VP (Verb saw) (NP (Det the) (Noun dog)))
                (PP (Prep in) (NP (Det the) (Noun town))))""")},
        (1, 7): {},
        (2, 8): {},
        (1, 8): {'S': Tree.fromstring(
            """(S
                (NP (Det the) (Noun man))
                (VP
                    (VP (Verb saw) (NP (Det the) (Noun dog)))
                    (PP (Prep in) (NP (Det the) (Noun town)))))""")},
    }
    self.assertEqual(parser._bp, bp)
from nltk.grammar import Nonterminal, PCFG
from pcfg_generate import *

sparseness = PCFG.fromstring("""
    S -> A V B [0.5] | C W D [0.5]
    A -> 'a0' [0.1] | 'a1' [0.1] | 'a2' [0.1] | 'a3' [0.1] | 'a4' [0.1] | 'a5' [0.1] | 'a6' [0.1] | 'a7' [0.1] | 'a8' [0.1] | 'a9' [0.1]
    B -> 'b0' [0.1] | 'b1' [0.1] | 'b2' [0.1] | 'b3' [0.1] | 'b4' [0.1] | 'b5' [0.1] | 'b6' [0.1] | 'b7' [0.1] | 'b8' [0.1] | 'b9' [0.1]
    C -> 'c0' [0.1] | 'c1' [0.1] | 'c2' [0.1] | 'c3' [0.1] | 'c4' [0.1] | 'c5' [0.1] | 'c6' [0.1] | 'c7' [0.1] | 'c8' [0.1] | 'c9' [0.1]
    D -> 'd0' [0.1] | 'd1' [0.1] | 'd2' [0.1] | 'd3' [0.1] | 'd4' [0.1] | 'd5' [0.1] | 'd6' [0.1] | 'd7' [0.1] | 'd8' [0.1] | 'd9' [0.1]
    V -> 'v0' [0.1] | 'v1' [0.1] | 'v2' [0.1] | 'v3' [0.1] | 'v4' [0.1] | 'v5' [0.1] | 'v6' [0.1] | 'v7' [0.1] | 'v8' [0.1] | 'v9' [0.1]
    W -> 'w0' [0.1] | 'w1' [0.1] | 'w2' [0.1] | 'w3' [0.1] | 'w4' [0.1] | 'w5' [0.1] | 'w6' [0.1] | 'w7' [0.1] | 'w8' [0.1] | 'w9' [0.1]
""")
print(sparseness)

texto = ""
context_ab = ""
context_cd = ""
for sentence in pcfg_generate(sparseness, n=100000, depth=6):
    # print(' '.join(sentence))
    # print(sentence)
    str1 = ' '.join(sentence)
    texto = texto + str1 + '\n'
for i in range(10):
    context_ab = context_ab + 'a' + str(i) + ' ' + 'u' + str(i) + ' ' + 'b' + str(i) + ' ' + '\n'
for i in range(10):
    context_cd = context_cd + 'c' + str(i) + ' ' + 'x' + str(i) + ' ' + 'd' + str(i) + ' ' + '\n'
texto = texto + context_ab + context_cd
grammar = PCFG.fromstring("""
    S -> side eqside [0.766312]
    side -> side plusterm [0.215816]
    side -> side minusterm [0.204728]
    side -> number variable [0.192571]
    number -> digit number [0.2953]
    digit -> '2' [0.20094]
    number -> '8' [0.0512296]
    variable -> 'x' [1.0]
    minusterm -> minus term [1.0]
    minus -> '-' [1.0]
    term -> digit number [0.185462]
    digit -> '6' [0.062679]
    number -> '0' [0.0918352]
    plusterm -> plus term [1.0]
    plus -> '+' [1.0]
    term -> number divnum [0.205817]
    number -> '2' [0.114112]
    divnum -> div number [1.0]
    div -> '/' [1.0]
    number -> '5' [0.0972758]
    eqside -> eq negside [0.402479]
    eq -> '=' [1.0]
    negside -> minus side [1.0]
    digit -> '1' [0.356414]
    S -> negside eqside [0.233688]
    side -> number divnum [0.0921555]
    digit -> '7' [0.0299559]
    number -> '3' [0.0644647]
    eqside -> eq side [0.597521]
    side -> 'x' [0.0759721]
    side -> '6' [0.00732374]
    digit -> '9' [0.0175681]
    number -> '6' [0.0557699]
    number -> '4' [0.0737897]
    number -> '7' [0.0733781]
    digit -> '3' [0.150809]
    term -> number variable [0.314671]
    side -> number divnumvar [0.0415636]
    divnumvar -> divnum variable [1.0]
    side -> digit number [0.074099]
    number -> dot number [0.0103797]
    dot -> '.' [1.0]
    number -> '1' [0.0461748]
    term -> 'x' [0.032113]
    digit -> '8' [0.0327874]
    digit -> '4' [0.0841404]
    digit -> '5' [0.0397696]
    number -> '9' [0.0262901]
    side -> '1' [0.0088784]
    side -> number parenside [0.0178505]
    parenside -> lparen siderparen [1.0]
    lparen -> '(' [1.0]
    siderparen -> side rparen [0.841012]
    term -> '5' [0.0212008]
    rparen -> ')' [1.0]
    term -> number parenside [0.0322466]
    term -> '3' [0.00984322]
    side -> parenside divnum [0.00844759]
    term -> parenside divnum [0.00383039]
    digit -> '0' [0.0249365]
    side -> '3' [0.00307185]
    term -> '4' [0.0399964]
    term -> '9' [0.0138963]
    term -> '1' [0.0294406]
    side -> '5' [0.00533828]
    term -> '8' [0.0189293]
    side -> '4' [0.00842886]
    side -> '2' [0.0126808]
    term -> '2' [0.035008]
    term -> number divnumvar [0.0251203]
    side -> '0' [0.0074174]
    side -> number vardivnum [0.00829774]
    vardivnum -> variable divnum [1.0]
    term -> number vardivnum [0.00944237]
    side -> dot number [0.0017045]
    side -> number starnum [0.00129243]
    starnum -> star number [1.0]
    star -> '*' [1.0]
    side -> '9' [0.00340901]
    term -> '7' [0.0130946]
    siderparen -> negside rparen [0.158988]
    term -> dot number [0.00218243]
    term -> number starnum [0.0014698]
    term -> '0' [0.00080171]
    side -> '8' [0.00681801]
    side -> '7' [0.00213531]
    term -> '6' [0.00543381]
""")
def test_ambiguo(self):
    grammar = PCFG.fromstring("""
        S -> NP VP [1.0]
        VP -> Vt NP [0.3]
        VP -> VP PP [0.7]
        NP -> NP PP [0.6]
        NP -> DT NN [0.4]
        PP -> IN NP [1.0]
        Vt -> 'saw' [1.0]
        NN -> 'man' [0.33]
        NN -> 'telescope' [0.33]
        NN -> 'dog' [0.34]
        DT -> 'the' [1.0]
        IN -> 'with' [1.0]
    """)
    parser = CKYParser(grammar)
    lp, t = parser.parse('the man saw the dog with the telescope'.split())
    pi = {
        (1, 1): {'DT': 0.0},
        (2, 2): {'NN': -1.5994620704162712},
        (3, 3): {'Vt': 0.0},
        (4, 4): {'DT': 0.0},
        (5, 5): {'NN': -1.5563933485243853},
        (6, 6): {'IN': 0.0},
        (7, 7): {'DT': 0.0},
        (8, 8): {'NN': -1.5994620704162712},
        (1, 2): {'NP': -2.9213901653036336},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': -2.8783214434117474},
        (5, 6): {},
        (6, 7): {},
        (7, 8): {'NP': -2.9213901653036336},
        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': -4.6152870375779536},
        (4, 6): {},
        (5, 7): {},
        (6, 8): {'PP': -2.9213901653036336},
        (1, 4): {},
        (2, 5): {},
        (3, 6): {},
        (4, 7): {},
        (5, 8): {},
        (1, 5): {'S': -7.536677202881587},
        (2, 6): {},
        (3, 7): {},
        (4, 8): {'NP': -6.536677202881587},
        (1, 6): {},
        (2, 7): {},
        (3, 8): {'VP': -8.051250375711346},
        (1, 7): {},
        (2, 8): {},
        (1, 8): {'S': -10.972640541014979}
    }
    self.assertEqualPi(parser._pi, pi)
    t2 = Tree.fromstring("""
        (S
            (NP (DT the) (NN man))
            (VP
                (VP (Vt saw) (NP (DT the) (NN dog)))
                (PP (IN with) (NP (DT the) (NN telescope)))))
    """)
    self.assertEqual(t, t2)
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce  # reduce is not a builtin on Python 3
    from nltk import tokenize
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S -> NP VP [1.0]
    VP -> V NP [.59]
    VP -> V [.40]
    VP -> VP PP [.01]
    NP -> Det N [.41]
    NP -> Name [.28]
    NP -> NP PP [.31]
    PP -> P NP [1.0]
    V -> 'saw' [.21]
    V -> 'ate' [.51]
    V -> 'ran' [.28]
    N -> 'boy' [.11]
    N -> 'cookie' [.12]
    N -> 'table' [.13]
    N -> 'telescope' [.14]
    N -> 'hill' [.5]
    Name -> 'Jack' [.52]
    Name -> 'Bob' [.48]
    P -> 'with' [.61]
    P -> 'under' [.39]
    Det -> 'the' [.41]
    Det -> 'a' [.31]
    Det -> 'my' [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for i in range(len(demos)):
            print('%3s: %s' % (i + 1, demos[i][0]))
            print('     %r' % demos[i][1])
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip()) - 1
    try:
        sent, grammar = demos[choice]
    except:
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time() - t)
        p = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
             if parses else 0)
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print('------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                               parsers[i].beam_size,
                                               times[i],
                                               num_parses[i],
                                               average_p[i]))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce  # reduce is not a builtin on Python 3

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S -> NP VP [1.0]
    VP -> V NP [.59]
    VP -> V [.40]
    VP -> VP PP [.01]
    NP -> Det N [.41]
    NP -> Name [.28]
    NP -> NP PP [.31]
    PP -> P NP [1.0]
    V -> 'saw' [.21]
    V -> 'ate' [.51]
    V -> 'ran' [.28]
    N -> 'boy' [.11]
    N -> 'cookie' [.12]
    N -> 'table' [.13]
    N -> 'telescope' [.14]
    N -> 'hill' [.5]
    Name -> 'Jack' [.52]
    Name -> 'Bob' [.48]
    P -> 'with' [.61]
    P -> 'under' [.39]
    Det -> 'the' [.41]
    Det -> 'a' [.31]
    Det -> 'my' [.28]
    """)

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # don't shadow the time module
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees
        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)