Example #1
0
def fill_missing_words(grammar: PCFG, missing_words: Set[str]) -> PCFG:
    """Extend ``grammar`` so that ``missing_words`` can be produced via ``UNK``.

    Two kinds of productions are added to the grammar's existing ones:

    * ``UNK -> word`` for every word in ``missing_words``;
    * ``X -> UNK`` once for every left-hand side ``X`` that has at least one
      production with a string (terminal) on its right-hand side.

    The combined production list is handed to ``induce_pcfg`` together with
    the original start symbol, and its result is returned.

    NOTE(review): if ``induce_pcfg`` is NLTK's, probabilities are re-estimated
    from production counts, so the probabilities of the input grammar are
    presumably not carried over -- confirm this is intended.
    """
    # UNK -> word1 | word2 | ... | wordN
    unknown = Nonterminal('UNK')
    unk_rules = [
        Production(unknown, [missing_word]) for missing_word in missing_words
    ]

    # Add UNK as a possibility to every left-hand side that has a string
    # somewhere in one of its right-hand sides. A set keeps the
    # "already handled" check O(1); unk_rules stays a list so the order in
    # which the new productions are emitted is unchanged.
    corrected: Set[Nonterminal] = set()
    rule: ProbabilisticProduction
    for rule in grammar.productions():
        lhs = rule.lhs()

        # rule has already been corrected
        if lhs in corrected:
            continue

        # right hand side has a string somewhere
        if any(isinstance(element, str) for element in rule.rhs()):
            unk_rules.append(Production(lhs, [unknown]))
            corrected.add(lhs)

    return induce_pcfg(grammar.start(), grammar.productions() + unk_rules)
Example #2
0
def _attaching(N, G, C, T):
    """Try to attach the symbol ``N`` as a new alternative of each OR symbol of ``G``.

    For every OR nonterminal ``O`` found among the left-hand sides of ``G``,
    the bicluster of its AND-OR group is expanded with ``N`` (a new row when
    the OR number is odd, a new column when it is even). If all the involved
    bicluster / expression-context matrices pass ``_is_mc`` and the difference
    in ``_log_posterior_gain`` exceeds ``LPG_DIFF_THRESHOLD``, the rules of
    ``O`` are rebuilt with ``N`` included, and the module-level ``biclusters``
    table, the grammar, the corpus and ``T`` are updated.

    Args:
        N: nonterminal to attach (only ``N.symbol()`` and ``N`` itself as a
            right-hand side are used).
        G: grammar exposing ``productions()`` and ``start()``.
        C: corpus, as consumed by ``_apply_grammar``, ``_count_occ`` and
            ``_reduce_corpus``.
        T: value returned unchanged unless a rule is added, in which case it
            is recomputed with ``_create_t(C)``.

    Returns:
        The tuple ``(G, C, T)`` after all accepted attachments.

    Side effects:
        Prints progress messages and mutates the module-level ``biclusters``.

    NOTE(review): ``C_derived`` and the list of OR symbols are computed once
    from the initial ``G`` / ``C`` and are not refreshed after an attachment
    is accepted inside the loop -- confirm this is intended.
    """
    print("attaching...")
    C_derived = _apply_grammar(G, C)
    n_sym = N.symbol()

    # Distinct OR nonterminals, in order of first appearance in G.
    ORs = []
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)

    ## for each OR symbol O in G do
    for O in ORs:
        ## if O leads to a valid expanded bicluster
        ## as well as a posterior gain (Eq.3) larger than a threshold then

        #
        #   AND-OR group

        o_sym = O.symbol()
        ## find the AND-OR group that O belongs to
        group = None
        for g in biclusters:
            if o_sym in g[1] or o_sym in g[2]:
                group = g
                break
        if group is None:
            # No bicluster references this OR symbol: nothing can be expanded,
            # and biclusters[None] below would only raise a KeyError.
            continue
        ## position of O inside the group:
        ## odd number -> left (False), even number -> right (True)
        num = int(o_sym[4:])  # OR number, e.g. "_OR_2" -> 2
        pos = num % 2 == 0

        #
        #   BC_tilde and BC_tilde_prime

        ## BC_t (BC_tilde): the group's bicluster, re-counted on the derived corpus
        BC_t = biclusters[group].copy()
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)
        ## BC_t_1 (BC_tilde_prime): BC_t expanded by the proposed rule OR -> N
        BC_t_1 = BC_t.copy()
        if not pos:
            ## new row (OR on the left)
            new_row = [_count_occ(" ".join((n_sym, x)), C) for x in BC_t.columns]
            BC_t_1.loc[n_sym, :] = new_row
        else:
            ## new column (OR on the right)
            new_col = [_count_occ(" ".join((x, n_sym)), C) for x in BC_t.index]
            BC_t_1.loc[:, n_sym] = new_col
        BC_t_1 = BC_t_1.astype(int)

        #
        #   EC_tilde and EC_tilde_prime

        ## EC_t: expression-context matrix of BC_t
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        ## EC_t_1: EC_t plus one row per new (row, column) pair involving N
        EC_t_1 = EC_t.copy()
        if not pos:
            ## OR on the left
            new_row_indices = [(n_sym, col) for col in BC_t_1.columns]
        else:
            ## OR on the right
            new_row_indices = [(row, n_sym) for row in BC_t_1.index]
        ## fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            e = " ".join(i)  # expression
            # Create the row first so the per-cell assignments below hit
            # existing labels.
            EC_t_1.loc[i_str, :] = [-1] * EC_t_1.shape[1]
            for j in EC_t_1.columns:
                c = list(_ec_index_to_tuple(j, False))  # context
                # Context parts flagged by _represents_int are replaced by "".
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str, j] = _count_occ(
                    " ".join([c[0], e, c[1]]).strip(), C
                )
        EC_t_1 = EC_t_1.astype(int)

        # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
        # same ndarray on both old and new pandas versions.
        bc_t_1 = BC_t_1.values
        ec_t_1 = EC_t_1.values
        bc_t = BC_t.values
        ec_t = EC_t.values

        #
        #   LOG POSTERIOR GAIN DIFFERENCE (Eq.3)

        ## are BC and EC valid (MC)?
        # Skip unless *all four* matrices are valid. The negation must cover
        # the whole conjunction: `not a and b and c and d` only negates `a`.
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1)
                and _is_mc(bc_t) and _is_mc(ec_t)):
            continue

        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)

        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (O.symbol(), N.symbol()))
            s = np.sum(bc_t_1)
            row_prob = np.sum(bc_t_1, 1) / s
            col_prob = np.sum(bc_t_1, 0) / s
            ## keep every rule whose left-hand side does not mention O
            rules = [
                prod for prod in G.productions()
                if o_sym not in prod.lhs().symbol()
            ]
            ## add the new rules of O, weighted by the marginal of its axis
            if not pos:
                ## OR on the left
                probs = row_prob
                rhs_symbols = list(BC_t.index) + [N]
            else:
                ## OR on the right
                probs = col_prob
                rhs_symbols = list(BC_t.columns) + [N]
            # probs has one entry per row/column of BC_t_1, so zip visits
            # exactly the first BC_t_1.shape[axis] symbols.
            for rhs, prob in zip(rhs_symbols, probs):
                rules.append(ProbabilisticProduction(O, [rhs], prob=prob))

            ## updates
            biclusters[group] = BC_t_1.copy()  # update the AND-OR group
            G = PCFG(G.start(), rules)  # update G
            C = _reduce_corpus(C, biclusters[group], N, True)  # reduce C
            T = _create_t(C)  # update T

    return G, C, T