def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """Estimate an unlexicalized PCFG from trees and build a CKY parser.

    parsed_sents -- list of training trees.
    start -- start symbol.
    horzMarkov -- None for default. A number n >= 0 for horizontal markov.

    Sets self.start, self.production (the list of probabilistic
    productions) and self.parser (a CKYParser over the resulting PCFG).
    """
    self.start = start

    # rhs_counts[X][rhs] = count(X -> rhs); lhs_counts[X] = count(X)
    rhs_counts = defaultdict(lambda: defaultdict(int))
    lhs_counts = defaultdict(int)

    for tree in parsed_sents:
        # Work on a deep copy so the caller's tree is left untouched by
        # the in-place transformations below.
        working = unlexicalize(tree.copy(deep=True))
        # Chomsky normal form with horizontal markovization.
        working.chomsky_normal_form(horzMarkov=horzMarkov)
        # Collapse subtrees that have a single child.
        working.collapse_unary(collapsePOS=True)

        for rule in working.productions():
            head = rule.lhs()
            rhs_counts[head][rule.rhs()] += 1
            lhs_counts[head] += 1

    # Relative-frequency estimate: q(X -> rhs) = count(X -> rhs) / count(X).
    self.production = [
        ProbabilisticProduction(head, rhs, prob=rhs_total / float(head_total))
        for head, head_total in lhs_counts.items()
        for rhs, rhs_total in rhs_counts[head].items()
    ]

    self.parser = CKYParser(PCFG(Nonterminal(start), self.production))
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    """Learn a PCFG from corpus C by iterative biclustering.

    C      -- the corpus handed to the table/learning helpers.
    alpha  -- value stored in the module-level ALPHA.
    gd_thr -- value stored in the module-level LPG_DIFF_THRESHOLD.
    mc_thr -- value stored in the module-level MC_THRESHOLD.

    The module-level symbol counters and the ignore_mc_ec flag are reset on
    every call. Returns the grammar produced by _postprocessing.
    """
    global ALPHA, LPG_DIFF_THRESHOLD, MC_THRESHOLD
    global and_symb_count, or_symb_count, ignore_mc_ec

    print("\ninitializing...")

    # Publish the hyper-parameters and reset the shared learning state.
    ALPHA, LPG_DIFF_THRESHOLD, MC_THRESHOLD = alpha, gd_thr, mc_thr
    and_symb_count = or_symb_count = 0
    ignore_mc_ec = False

    ## create an empty grammar G (a single placeholder start rule)
    start_symbol = Nonterminal("_START_")
    G = PCFG(start_symbol,
             [ProbabilisticProduction(start_symbol, [""], prob=1.)])

    ## create the table T from the corpus
    T = _create_t(C)

    ## repeat until no further rule can be learned
    iteration = 0
    while True:
        if _finished(T):
            break
        iteration += 1
        print("\niter. n° %d" % (iteration,))

        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break

        G, C, T = _attaching(N, G, C, T)

    G = _postprocessing(G, C)
    print("\n", G)  # DEBUG
    return G
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """Estimate an unlexicalized PCFG from trees and build a CKY parser.

    parsed_sents -- list of training trees.
    start -- name of the start symbol of the grammar.
    horzMarkov -- forwarded to chomsky_normal_form on every tree.

    Sets self.start, self.prods (list of ProbabilisticProduction) and
    self.my_parser (a CKYParser over the resulting PCFG).
    """
    self.start = start  # start symbol for the CKY parser's grammar
    self.prods = []     # list of probabilistic productions

    # { A -> B : count(A -> B) }
    rule_counts = defaultdict(int)
    # { A : count(A) }
    head_counts = defaultdict(int)

    # Deep-copy each tree before unlexicalizing it, because unlexicalize
    # modifies the tree it is given.
    unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

    for tree in unlex_sents:
        tree.chomsky_normal_form(horzMarkov=horzMarkov)
        tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # Each item is an nltk.grammar.Production: lhs() is a Nonterminal
        # and rhs() is a tuple.
        for rule in tree.productions():
            rule_counts[rule] += 1
            head_counts[rule.lhs()] += 1

    # Maximum-likelihood estimate: q(A -> B) = count(A -> B) / count(A).
    for rule, rule_total in rule_counts.items():
        head = rule.lhs()
        q_ml = float(rule_total) / head_counts[head]
        self.prods.append(
            ProbabilisticProduction(head, rule.rhs(), prob=q_ml))

    # PCFG(start, productions): start is a Nonterminal, productions a list.
    self.my_parser = CKYParser(PCFG(Nonterminal(start), self.prods))
def _learning_by_biclustering(G, C, T):
    """Learn one AND-OR group from the best bicluster of table T.

    G -- current grammar (PCFG).
    C -- current corpus.
    T -- current table built from C.

    Returns a 5-tuple (found, G, C, T, N):
      * when no bicluster is found, (False, G, C, T, None) with the inputs
        returned unchanged;
      * otherwise (True, updated grammar, reduced corpus, rebuilt table,
        new AND symbol N).

    Side effects: toggles the module-level ignore_mc_ec flag while
    searching and records the learned bicluster in the module-level
    biclusters dict under the key (N, A, B) symbol names.
    """
    print("learning...")

    global biclusters
    global ignore_mc_ec

    ## find the valid bicluster BC in T that leads to the maximal
    ## posterior gain (Eq.2)
    BC = None

    ## first try: up to 3 attempts with the current settings
    for _ in range(3):
        BC = _get_best_bicluster(T, C)
        if BC is not None:
            break

    if BC is None:
        ## second try: up to 2 more attempts with ignore_mc_ec switched on
        ignore_mc_ec = True
        for _ in range(2):
            BC = _get_best_bicluster(T, C)
            if BC is not None:
                break
        if BC is None:
            return False, G, C, T, None

    ignore_mc_ec = False

    ## create an AND symbol N and two OR symbols A, B
    N = Nonterminal("_AND_" + str(_get_and_symb_index()))
    A = Nonterminal("_OR_" + str(_get_or_symb_index()))
    B = Nonterminal("_OR_" + str(_get_or_symb_index()))

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in old and new pandas alike.
    counts = BC.values
    total = np.sum(counts)
    row_prob = np.sum(counts, 1) / total   # marginal over the rows
    col_prob = np.sum(counts, 0) / total   # marginal over the columns

    ## build the new rules: A -> row labels, B -> column labels, N -> A B
    rules = [
        ProbabilisticProduction(A, [_format_nt(label)], prob=p)
        for label, p in zip(BC.index, row_prob)
    ]
    rules += [
        ProbabilisticProduction(B, [_format_nt(label)], prob=p)
        for label, p in zip(BC.columns, col_prob)
    ]
    rules.append(ProbabilisticProduction(N, [A, B], prob=1.))

    ## updates
    G_updated = PCFG(G.start(), G.productions() + rules)  # add rules to G
    C_reduced = _reduce_corpus(C, BC, N)                  # reduce the corpus
    T_updated = _create_t(C_reduced)                      # rebuild T
    # remember BC for the learned AND-OR group
    biclusters[(N.symbol(), A.symbol(), B.symbol())] = BC

    return True, G_updated, C_reduced, T_updated, N
def baseline(depth=5, n=500):
    """Build the hand-written "baseline" PCFG and a corpus generated from it.

    depth -- forwarded to generate.generate as its depth argument.
    n     -- forwarded to generate.generate as its n argument.

    Every generated sentence is printed with its 1-based rank. Returns
    (G, C) where G is the PCFG and C is one string holding all generated
    sentences, each followed by '. '.
    """
    ## nonterminal symbols
    S, NP, VP, PP, Det, Vt, Vc, Vi, N, P = (
        Nonterminal(label)
        for label in ("S", "NP", "VP", "PP", "Det", "Vt", "Vc", "Vi", "N", "P")
    )

    ## probabilistic production rules, as (lhs, rhs, probability)
    rule_table = [
        (S, [NP, VP], 1.),
        (NP, [Det, N], 1.),
        (VP, [Vt, NP], 1 / 3),
        (VP, [Vc, PP], 1 / 3),
        (VP, [Vi], 1 / 3),
        (PP, [P, NP], 1.),
        (Det, ["a"], .5),
        (Det, ["the"], .5),
        (Vt, ["touches"], .5),
        (Vt, ["covers"], .5),
        (Vi, ["rolls"], .5),
        (Vi, ["bounces"], .5),
        (Vc, ["is"], 1.),
        (N, ["circle"], 1 / 3),
        (N, ["square"], 1 / 3),
        (N, ["triangle"], 1 / 3),
        (P, ["above"], .5),
        (P, ["below"], .5),
    ]
    R = [ProbabilisticProduction(lhs, rhs, prob=p) for lhs, rhs, p in rule_table]

    G = PCFG(S, R)  # grammar

    ## enumerate the sentences, printing each one as it is produced
    print("\n")
    pieces = []
    for rank, words in enumerate(generate.generate(G, depth=depth, n=n), 1):
        sentence = ' '.join(words)
        pieces.append(sentence + '. ')
        print('%3d. %s%s' % (rank, sentence, '.'))

    C = ''.join(pieces)  # corpus
    return G, C
def langley_1(depth=5, n=500):
    """Build the hand-written "langley_1" PCFG and a corpus generated from it.

    depth -- forwarded to generate.generate as its depth argument.
    n     -- forwarded to generate.generate as its n argument.

    Every generated sentence is printed with its 1-based rank. Returns
    (G, C) where G is the PCFG and C is one string holding all generated
    sentences, each followed by '. '.
    """
    ## nonterminal symbols
    S, NP, VP, AP, Adj, Det, Vt, Vi, N = (
        Nonterminal(label)
        for label in ("S", "NP", "VP", "AP", "Adj", "Det", "Vt", "Vi", "N")
    )

    ## probabilistic production rules, as (lhs, rhs, probability)
    rule_table = [
        (S, [NP, VP], 1.),
        (VP, [Vi], .5),
        (VP, [Vt, NP], .5),
        (NP, [Det, N], .5),
        (NP, [Det, AP, N], .5),
        (AP, [Adj], .5),
        (AP, [Adj, AP], .5),
        (Det, ["the"], 1.),
        (Vt, ["saw"], .5),
        (Vt, ["heard"], .5),
        (Vi, ["ate"], .5),
        (Vi, ["slept"], .5),
        (N, ["cat"], .5),
        (N, ["dog"], .5),
        (Adj, ["big"], .5),
        (Adj, ["old"], .5),
    ]
    R = [ProbabilisticProduction(lhs, rhs, prob=p) for lhs, rhs, p in rule_table]

    G = PCFG(S, R)  # grammar

    ## enumerate the sentences, printing each one as it is produced
    print("\n")
    pieces = []
    for rank, words in enumerate(generate.generate(G, depth=depth, n=n), 1):
        sentence = ' '.join(words)
        pieces.append(sentence + '. ')
        print('%3d. %s%s' % (rank, sentence, '.'))

    C = ''.join(pieces)  # corpus
    return G, C
def _postprocessing(G, C):
    """Replace the placeholder start rule of G with rules learned from C.

    G -- grammar whose start-symbol productions are to be replaced.
    C -- corpus text; it is split with sent_tokenize / word_tokenize.

    Productions whose left-hand side name contains G's start symbol name
    are dropped. If nothing is left, G is returned unchanged. Otherwise,
    for every sentence of C that is a single token x, a rule _START_ -> x
    is added, weighted by how often x occurs as such a sentence.
    """
    print("\npostprocessing...")

    ## drop the _START_ -> ... rule(s)
    start_name = G.start().symbol()
    rules = [
        prod for prod in G.productions()
        if start_name not in prod.lhs().symbol()
    ]
    if not rules:
        return G

    ## create an OR symbol S
    S = Nonterminal("_START_")

    ## count the sentences of C that are fully reduced to a single symbol
    sss = {}  # single symbol sentences: symbol -> number of occurrences
    for sentence in sent_tokenize(C):
        # strip punctuation before tokenizing
        tokens = word_tokenize(re.sub(r'[^\w\s]', '', sentence))
        if len(tokens) == 1:
            symbol = tokens[0]
            sss[symbol] = sss.get(symbol, 0) + 1

    ## add S -> x for each such symbol, with its relative frequency
    weight_sum = sum(sss.values())
    for symbol, weight in sss.items():
        rules.append(
            ProbabilisticProduction(S, [_format_nt(symbol)],
                                    prob=weight / weight_sum))

    return PCFG(S, rules)
def _attaching(N, G, C, T):
    """Try to attach the new AND symbol N under the existing OR symbols of G.

    N -- the AND symbol (Nonterminal) that was just learned.
    G -- current grammar (PCFG).
    C -- current corpus.
    T -- current table; it is only replaced, never read, in this function.

    For each OR symbol O that is a left-hand side in G, the bicluster of
    O's AND-OR group is extended with N (as a new row or a new column).
    If the original and extended biclusters and their expression-context
    matrices all pass _is_mc, and the log posterior gain difference (Eq.3)
    exceeds LPG_DIFF_THRESHOLD, the rule O -> N is added: O's rules are
    rebuilt, the stored bicluster is replaced, and the corpus and table
    are updated.

    Returns the (possibly updated) triple (G, C, T).
    """
    print("attaching...")

    C_derived = _apply_grammar(G, C)
    n_name = N.symbol()

    # OR symbols (Nonterminal) that appear as a left-hand side, first-seen order
    ORs = []
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)

    ## for each OR symbol O in G do
    for O in ORs:
        o_name = O.symbol()

        # # AND-OR group
        ## find the AND-OR group that O belongs to. Compare names with ==:
        ## a substring test would make "_OR_1" match "_OR_10".
        group = next(
            (g for g in biclusters if o_name == g[1] or o_name == g[2]),
            None,
        )
        if group is None:
            # No recorded bicluster for this OR symbol: nothing to extend.
            continue

        ## side of O inside its group, from the parity of its number
        ## (e.g. "_OR_2" -> 2): odd = left (rows), even = right (columns).
        ## NOTE(review): relies on OR symbols being numbered in (A, B)
        ## pairs starting at an odd index -- confirm in _get_or_symb_index.
        on_right = int(o_name[4:]) % 2 == 0

        # # BC_tilde and BC_tilde_prime
        ## BC_t: the group's bicluster, recounted on the derived corpus
        BC_t = biclusters[group].copy()
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)

        ## BC_t_1: BC_t extended with N (proposed new rule OR -> AND)
        BC_t_1 = BC_t.copy()
        if on_right:
            ## new column (OR on the right)
            BC_t_1.loc[:, n_name] = [
                _count_occ(" ".join((x, n_name)), C) for x in BC_t.index
            ]
        else:
            ## new row (OR on the left)
            BC_t_1.loc[n_name, :] = [
                _count_occ(" ".join((n_name, x)), C) for x in BC_t.columns
            ]
        BC_t_1 = BC_t_1.astype(int)

        # # EC_tilde and EC_tilde_prime
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        EC_t_1 = EC_t.copy()

        ## rows to add to EC_t_1: one per (row, column) pair that involves N
        if on_right:
            new_row_indices = [(row, n_name) for row in BC_t_1.index]
        else:
            new_row_indices = [(n_name, col) for col in BC_t_1.columns]

        ## fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            EC_t_1.loc[i_str, :] = [-1] * EC_t_1.shape[1]
            for j in EC_t_1.columns:
                # expression and its (left, right) context
                e, c = " ".join(i), list(_ec_index_to_tuple(j, False))
                # context parts for which _represents_int is true are
                # replaced by the empty string
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str, j] = _count_occ(
                    " ".join([c[0], e, c[1]]).strip(), C)
        EC_t_1 = EC_t_1.astype(int)

        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns
        # the same ndarray and exists in old and new pandas alike.
        bc_t_1 = BC_t_1.values
        ec_t_1 = EC_t_1.values
        bc_t = BC_t.values
        ec_t = EC_t.values

        # # LOG POSTERIOR GAIN DIFFERENCE (Eq.3)
        ## all four matrices must be valid (MC). The parentheses matter:
        ## "not a and b and c and d" would negate only the first test.
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1)
                and _is_mc(bc_t) and _is_mc(ec_t)):
            continue

        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)

        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (o_name, n_name))

            total = np.sum(bc_t_1)
            if on_right:
                probs = np.sum(bc_t_1, 0) / total   # column marginals
                labels = BC_t.columns
            else:
                probs = np.sum(bc_t_1, 1) / total   # row marginals
                labels = BC_t.index

            ## keep every rule except O's own. Exact name comparison, so
            ## that rebuilding "_OR_1" does not also drop "_OR_10".
            rules = [
                prod for prod in G.productions()
                if prod.lhs().symbol() != o_name
            ]

            ## rebuild O's rules over the extended bicluster; N is the last
            ## row/column of BC_t_1, hence the last right-hand side. Labels
            ## go through _format_nt, as _learning_by_biclustering does for
            ## the same labels (presumably so that names of learned symbols
            ## become Nonterminal objects rather than terminal strings).
            rhs_symbols = [_format_nt(x) for x in labels] + [N]
            for rhs, p in zip(rhs_symbols, probs):
                rules.append(ProbabilisticProduction(O, [rhs], prob=p))

            ## updates
            biclusters[group] = BC_t_1.copy()   # update the AND-OR group
            G = PCFG(G.start(), rules)          # update G
            C = _reduce_corpus(C, biclusters[group], N, True)  # reduce C
            T = _create_t(C)                    # rebuild T

    return G, C, T
def extract_simple_pcfg(n):
    """Induce a PCFG rooted at "S" and return it with its rules sorted.

    n -- forwarded unchanged to extract_simple_productions.

    The productions are fed to grammar.induce_pcfg; the result is rebuilt
    as a PCFG with the same start symbol and its productions passed
    through sort_rules.
    """
    simple_productions = extract_simple_productions(n)
    induced = grammar.induce_pcfg(Nonterminal("S"), simple_productions)
    start_symbol = induced.start()
    ordered_rules = sort_rules(induced.productions())
    return PCFG(start_symbol, ordered_rules)