Example #1
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        start -- start symbol.
        horzMarkov -- None for default. An integer n >= 0 for horizontal Markovization.
        """
        self.start = start

        count_Y_Z = defaultdict(lambda: defaultdict(int))
        count_X = defaultdict(int)
        for t in parsed_sents:
            # Work on a deep copy: unlexicalize mutates the tree in place.
            unle_trees = unlexicalize(t.copy(deep=True))
            # Chomsky normal form with horizontal Markovization.
            unle_trees.chomsky_normal_form(horzMarkov=horzMarkov)
            # collapse subtrees with a single child.
            unle_trees.collapse_unary(collapsePOS=True)
            for prod in unle_trees.productions():
                count_Y_Z[prod.lhs()][prod.rhs()] += 1
                count_X[prod.lhs()] += 1

        # create a list of productions.
        productions = []
        for X, c_X in count_X.items():
            for (Y_Z, c_Y_Z) in count_Y_Z[X].items():
                q = c_Y_Z / float(c_X)
                productions.append(ProbabilisticProduction(X, Y_Z, prob=q))

        self.production = productions

        grammar = PCFG(Nonterminal(start), productions)
        self.parser = CKYParser(grammar)
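A minimal usage sketch for the constructor above, assuming it belongs to a UPCFG-style class (the class name UPCFG is an assumption) and that training trees are nltk.tree.Tree objects whose leaves are words:

# Hypothetical usage; `UPCFG` names the class this __init__ belongs to.
from nltk.tree import Tree

t = Tree.fromstring("(sentence (NP (Det the) (N dog)) (VP (V sleeps)))")
model = UPCFG([t], start='sentence', horzMarkov=0)
# model.parser is now a CKYParser over the induced unlexicalized PCFG.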
Example #2
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    print("\ninitializing...")
    global ALPHA
    global LPG_DIFF_THRESHOLD
    global MC_THRESHOLD
    global and_symb_count
    global or_symb_count
    global ignore_mc_ec
    ALPHA = alpha
    LPG_DIFF_THRESHOLD = gd_thr
    MC_THRESHOLD = mc_thr
    and_symb_count = 0
    or_symb_count = 0
    ignore_mc_ec = False
    
    ## create an empty grammar G
    S = Nonterminal("_START_")
    R = [ProbabilisticProduction(S, [""], prob=1.)]
    G = PCFG(S, R)
    
    T = _create_t(C) # create a table T
    
    ## repeat until no further rules can be learned
    i = 0
    while not _finished(T):
        i += 1
        print("\niter. n° %d" % (i,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)
    G = _postprocessing(G, C)
    print("\n", G) # DEBUG
    return G
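A short usage sketch: pcfg_bcl consumes a plain-text corpus such as the one produced by baseline() in Example #5, so the two can be chained directly:

# Learn a PCFG back from a generated corpus (uses baseline() from Example #5).
G_true, C = baseline(depth=5, n=500)
G_learned = pcfg_bcl(C)
print(G_learned)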
Example #3
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        """
        # { A -> B : count(A -> B) }
        productions_counts = defaultdict(int)
        # { A : count(A) }
        lhs_count = defaultdict(int)  # left_hand_side_count

        self.start = start  # start symbol for the CKY parser's grammar
        self.prods = []  # list of productions

        # Copy each tree before unlexicalizing: unlexicalize modifies the
        # tree in place.
        # Original: unlexicalize_tree = [unlexicalize(t) for t in parsed_sents]
        unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

        for t in unlex_sents:
            t.chomsky_normal_form(horzMarkov=horzMarkov)
            t.collapse_unary(collapsePOS=True, collapseRoot=True)
            for prod in t.productions():
                # type(prod): <class 'nltk.grammar.Production'>
                # type(prod.lhs): <class 'nltk.grammar.Nonterminal'>
                # type(prod.rhs): <class 'tuple'>
                #   Each element of prod.rhs() is of type:
                #       <class 'nltk.grammar.Nonterminal'>
                productions_counts[prod] += 1
                lhs_count[prod.lhs()] += 1

        for prod, count_prod in productions_counts.items():
            # type(prod): <class 'nltk.grammar.Production'>
            #   prod : A -> B
            # type(count_prod): int
            #   count_prod : count(A -> B)
            count_lhs = lhs_count.get(prod.lhs(), 0)

            # type(prod.lhs): <class 'nltk.grammar.Nonterminal'>
            # type(prod.rhs): <class 'tuple'>
            q_ML = float(count_prod) / count_lhs
            self.prods += [ProbabilisticProduction(prod.lhs(),
                                                   prod.rhs(),
                                                   prob=q_ML)]
            # Each element of self.prods is of type:
            #     <class 'nltk.grammar.ProbabilisticProduction'>

        # type(PCFG(...)) = <class 'nltk.grammar.PCFG'>
        # PCFG(start, productions)
        #       type(start): Nonterminal
        #       type(productions): list(Production)
        grammar = PCFG(Nonterminal(start), self.prods)
        self.my_parser = CKYParser(grammar)
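The probability q_ML above is the maximum-likelihood estimate count(A -> B) / count(A). A self-contained sketch of that computation over a toy production list (the counts are illustrative):

from collections import defaultdict
from nltk.grammar import Nonterminal, Production

NP, Det, N = Nonterminal('NP'), Nonterminal('Det'), Nonterminal('N')
prods = [Production(NP, [Det, N]), Production(NP, [Det, N]), Production(NP, [N])]

production_counts = defaultdict(int)
lhs_counts = defaultdict(int)
for p in prods:
    production_counts[p] += 1
    lhs_counts[p.lhs()] += 1

for p, c in production_counts.items():
    print(p, c / lhs_counts[p.lhs()])  # NP -> Det N: 2/3, NP -> N: 1/3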
Example #4
def _learning_by_biclustering(G, C, T):
    print("learning...")
    global biclusters
    global ignore_mc_ec
    
    ## find the valid bicluster BC in T that leads to the maximal posterior gain (Eq. 2)
    BC = None
    
    ## first attempt
    attempts = 3
    while BC is None and attempts > 0:
        attempts -= 1
        BC = _get_best_bicluster(T, C)
    
    if BC is None:
        ignore_mc_ec = True
    
        ## second attempt
        attempts = 2
        while BC is None and attempts > 0:
            attempts -= 1
            BC = _get_best_bicluster(T, C)
    
        if BC is None:
            return False, G, C, T, None
        ignore_mc_ec = False
        
    ## create an AND symbol N and two OR symbols A, B
    N = Nonterminal("_AND_"+str(_get_and_symb_index()))
    A = Nonterminal("_OR_"+str(_get_or_symb_index()))
    B = Nonterminal("_OR_"+str(_get_or_symb_index()))
    bc = BC.to_numpy()
    s = np.sum(bc)
    row_prob = np.sum(bc, 1)/s
    col_prob = np.sum(bc, 0)/s
    ## create the rules
    rules = []
    rules += [ProbabilisticProduction(A, [_format_nt(BC.index[i])], prob=row_prob[i])
              for i in range(BC.shape[0])]
    rules += [ProbabilisticProduction(B, [_format_nt(BC.columns[j])], prob=col_prob[j])
              for j in range(BC.shape[1])]
    rules += [ProbabilisticProduction(N, [A, B], prob=1.)]
    ## updates
    G_updated = PCFG(G.start(), G.productions() + rules) # add the new rules to G
    C_reduced = _reduce_corpus(C, BC, N) # reduce the corpus
    T_updated = _create_t(C_reduced) # update T
    biclusters[(N.symbol(),A.symbol(),B.symbol())] = BC # save BC for the learned AND-OR group
    return True, G_updated, C_reduced, T_updated, N
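A compact sketch of the probability extraction above: the bicluster's row and column sums, normalized by the grand total, become the weights of the rules A -> row-symbol and B -> column-symbol (matrix values are illustrative):

import numpy as np

bc = np.array([[4, 2],
               [1, 3]])
s = np.sum(bc)
row_prob = np.sum(bc, 1) / s  # weights for A -> row symbols
col_prob = np.sum(bc, 0) / s  # weights for B -> column symbols
print(row_prob, col_prob)  # [0.6 0.4] [0.5 0.5]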
Example #5
def baseline(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    PP = Nonterminal("PP")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vc = Nonterminal("Vc")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    P = Nonterminal("P")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(NP, [Det, N], prob=1.),
        ProbabilisticProduction(VP, [Vt, NP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vc, PP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vi], prob=1 / 3),
        ProbabilisticProduction(PP, [P, NP], prob=1.),
        ProbabilisticProduction(Det, ["a"], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=.5),
        ProbabilisticProduction(Vt, ["touches"], prob=.5),
        ProbabilisticProduction(Vt, ["covers"], prob=.5),
        ProbabilisticProduction(Vi, ["rolls"], prob=.5),
        ProbabilisticProduction(Vi, ["bounces"], prob=.5),
        ProbabilisticProduction(Vc, ["is"], prob=1.),
        ProbabilisticProduction(N, ["circle"], prob=1 / 3),
        ProbabilisticProduction(N, ["square"], prob=1 / 3),
        ProbabilisticProduction(N, ["triangle"], prob=1 / 3),
        ProbabilisticProduction(P, ["above"], prob=.5),
        ProbabilisticProduction(P, ["below"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for i, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s.' % (i, s))
    return G, C
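A minimal usage sketch: build the grammar and its generated corpus, then hand the corpus to a learner (generate is assumed to be nltk.parse.generate, matching the call above):

G, C = baseline(depth=5, n=500)
print(G)       # the hand-written PCFG
print(C[:60])  # the start of the flattened corpus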
Example #6
def langley_1(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    AP = Nonterminal("AP")
    Adj = Nonterminal("Adj")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(VP, [Vi], prob=.5),
        ProbabilisticProduction(VP, [Vt, NP], prob=.5),
        ProbabilisticProduction(NP, [Det, N], prob=.5),
        ProbabilisticProduction(NP, [Det, AP, N], prob=.5),
        ProbabilisticProduction(AP, [Adj], prob=.5),
        ProbabilisticProduction(AP, [Adj, AP], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=1.),
        ProbabilisticProduction(Vt, ["saw"], prob=.5),
        ProbabilisticProduction(Vt, ["heard"], prob=.5),
        ProbabilisticProduction(Vi, ["ate"], prob=.5),
        ProbabilisticProduction(Vi, ["slept"], prob=.5),
        ProbabilisticProduction(N, ["cat"], prob=.5),
        ProbabilisticProduction(N, ["dog"], prob=.5),
        ProbabilisticProduction(Adj, ["big"], prob=.5),
        ProbabilisticProduction(Adj, ["old"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for i, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s.' % (i, s))
    return G, C
Example #7
def _postprocessing(G, C):
    print("\npostprocessing...")
    ## remove the _START_ -> ... rule
    rules = []
    for prod in G.productions():
        if G.start().symbol() not in prod.lhs().symbol():
            rules.append(prod)
    if len(rules) == 0:
        return G
    ## create an OR symbol S
    S = Nonterminal("_START_")
    sss = {} # single symbol sentences
    ## for each sentence s in C do
    ##   if s is fully reduced to a single symbol x then
    ##   add S -> x to G, or if the rule already exists, increase its weight by 1
    for sentence in sent_tokenize(C):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        t = word_tokenize(sentence)
        if len(t) == 1:
            sss[t[0]] = sss.get(t[0], 0) + 1
    weight_sum = sum(sss.values())
    rules += [ProbabilisticProduction(S, [_format_nt(k)], prob=sss[k]/weight_sum) for k in sss]
    return PCFG(S, rules)
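A reduced sketch of the single-symbol sentence count above, assuming nltk's punkt tokenizer data is installed; each sentence that tokenizes to a single token votes for one S -> x rule. Plain words stand in for the reduced symbols of the real algorithm:

import re
from nltk.tokenize import sent_tokenize, word_tokenize

C = "rolls. rolls. bounces."
sss = {}
for sentence in sent_tokenize(C):
    sentence = re.sub(r'[^\w\s]', '', sentence)
    tokens = word_tokenize(sentence)
    if len(tokens) == 1:
        sss[tokens[0]] = sss.get(tokens[0], 0) + 1
print(sss)  # expected: {'rolls': 2, 'bounces': 1}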
Example #8
def _attaching(N, G, C, T):
    print("attaching...")
    C_derived = _apply_grammar(G, C)
    ORs = [] # list of OR symbols (Nonterminal)
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)
    ## for each OR symbol O in G do
    for O in ORs:
        ## if O leads to a valid expanded bicluster
        ## as well as a posterior gain (Eq.3) larger than a threshold then
        
        #
        #   AND-OR group
        
        group = None
        pos = None # left or right (odd -> False, even -> True)
        ## look up the AND-OR group of O
        for g in biclusters:
            if O.symbol() in g[1] or O.symbol() in g[2]:
                group = g
                break
        ## determine the position of O within the group
        num = int(O.symbol()[4:]) # OR symbol number, e.g. "_OR_2" -> 2
        pos = (num % 2 == 0)
        
        #
        #   BC_tilde and BC_tilde_prime
        
        ## create BC_t (BC_tilde)
        BC_t = biclusters[group].copy()
        ## fill BC_t
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)
        ## create BC_t_1 (BC_tilde_prime) (proposed new rule OR -> AND)
        BC_t_1 = BC_t.copy()
        ## fill BC_t_1
        if not pos:
            ## new row (OR on the left)
            new_row = [_count_occ(" ".join((N.symbol(),x)), C) for x in BC_t.columns]
            BC_t_1.loc[N.symbol(),:] = new_row
            BC_t_1 = BC_t_1.astype(int)
        else:
            ## new column (OR on the right)
            new_col = [_count_occ(" ".join((x,N.symbol())), C) for x in BC_t.index]
            BC_t_1.loc[:,N.symbol()] = new_col
            BC_t_1 = BC_t_1.astype(int)
        
        #
        #   EC_tilde and EC_tilde_prime

        ## create and fill EC_t
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        ## create EC_t_1
        EC_t_1 = EC_t.copy()
        ## add the new rows to EC_t_1
        if not pos:
            ## OR on the left
            new_row_indices = [(N.symbol(),col) for col in BC_t_1.columns]
        else:
            ## OR on the right
            new_row_indices = [(row,N.symbol()) for row in BC_t_1.index]
        ## fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            EC_t_1.loc[i_str,:] = [-1]*EC_t_1.shape[1]
            for j in EC_t_1.columns:
                e, c = " ".join(i), list(_ec_index_to_tuple(j, False)) # expression, context
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str,j] = _count_occ(" ".join([c[0],e,c[1]]).strip(), C)
        EC_t_1 = EC_t_1.astype(int)
        bc_t_1 = BC_t_1.to_numpy()
        ec_t_1 = EC_t_1.to_numpy()
        bc_t = BC_t.to_numpy()
        ec_t = EC_t.to_numpy()
        
        #
        #   LOG POSTERIOR GAIN DIFFERENCE (Eq.3)
        
        ## are BC and EC valid (multinomially consistent)?
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1) and _is_mc(bc_t) and _is_mc(ec_t)):
            continue
        
        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)
        
        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (O.symbol(),N.symbol()))
            bc = BC_t_1.to_numpy()
            s = np.sum(bc)
            row_prob = np.sum(bc, 1)/s
            col_prob = np.sum(bc, 0)/s
            ## rules
            rules = []
            for prod in G.productions():
                if O.symbol() not in prod.lhs().symbol():
                    rules.append(prod)
            ## add the new rules
            if not pos:
                ## OR on the left
                probs = row_prob
                rhs_symbols = [x for x in BC_t.index]+[N]
                for i in range(BC_t_1.shape[0]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[i]], prob=probs[i]))
            else:
                ## OR on the right
                probs = col_prob
                rhs_symbols = [x for x in BC_t.columns]+[N]
                for j in range(BC_t_1.shape[1]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[j]], prob=probs[j]))
                
            ## updates
            biclusters[group] = BC_t_1.copy() # update the AND-OR group
            G = PCFG(G.start(), rules) # update G
            C = _reduce_corpus(C, biclusters[group], N, True) # reduce C
            T = _create_t(C) # update T
            
    return G, C, T
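A compact sketch of how BC_tilde_prime is built above: depending on the side of the OR symbol, a row or a column for the AND symbol is appended to the bicluster DataFrame (values are illustrative):

import pandas as pd

BC_t = pd.DataFrame([[4, 2], [1, 3]], index=['a', 'b'], columns=['x', 'y'])
BC_t_1 = BC_t.copy()
BC_t_1.loc['_AND_0', :] = [2, 1]  # OR on the left: append a new row
BC_t_1 = BC_t_1.astype(int)       # row insertion upcasts to float; restore ints
print(BC_t_1)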
Example #9
def extract_simple_pcfg(n):
    rules = extract_simple_productions(n)
    pcfg = grammar.induce_pcfg(Nonterminal("S"), rules)
    return PCFG(pcfg.start(), sort_rules(pcfg.productions()))
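For context, nltk's grammar.induce_pcfg estimates rule probabilities from a plain production list by relative frequency; a standalone sketch with toy productions (extract_simple_productions and sort_rules are project helpers not shown here):

from nltk import grammar
from nltk.grammar import Nonterminal, Production

S, NP, VP = Nonterminal('S'), Nonterminal('NP'), Nonterminal('VP')
prods = [Production(S, [NP, VP]), Production(NP, ['dogs']), Production(VP, ['bark'])]
pcfg = grammar.induce_pcfg(S, prods)
print(pcfg)  # each rule gets probability count(lhs -> rhs) / count(lhs)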