Example 1
import numpy as np


def marginal_likelihood(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel):

    PL, PM, PAj, PFj = model.components
    ll = 0.0
    for e_snt, f_snt in zip(e_corpus.itersentences(), f_corpus.itersentences()):
        # observations
        l = e_snt.shape[0]
        m = f_snt.shape[0]
        log_pl = np.log(PL.generate(l))
        log_pm = np.log(PM.generate(m))

        # P(f|e) = \prod_j P(f_j|e)
        #          = \prod_j \sum_i P(f_j,a_j=i|e)
        log_pf_e = 0.0
        for j, f in enumerate(f_snt):
            # P(f_j|e) = \sum_i P(f_j,a_j=i|e)
            pfj_e = 0.0  # contribution of this French word
            for i, e in enumerate(e_snt):
                # P(f_j, a_j=i | e) = P(a_j=i) P(f_j|e_i, l, m)
                pfj_e += PAj.generate((j, i), e_snt, 0, l, m) * PFj.generate((j, f), (j, i), e_snt, 0, l, m)
            # P(f|e) = \prod_j P(f_j|e)
            log_pf_e += np.log(pfj_e)
        # accumulate the corpus log-likelihood: log P(l) + log P(m) + log P(f|e,l,m)
        ll += log_pl + log_pm + log_pf_e
    # average negative log-likelihood per sentence
    return -ll / e_corpus.n_sentences()
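
To make the inner sum P(f_j|e) = \sum_i P(a_j=i) P(f_j|e_i) concrete, here is a minimal numeric sketch under IBM1-style assumptions: uniform P(a_j=i) = 1/l and a made-up toy lexical table t (in the function above these terms come from PAj and PFj).

import numpy as np

# hypothetical toy lexical table: t[(f, e)] = P(f|e)
t = {('la', 'the'): 0.7, ('la', 'house'): 0.1,
     ('maison', 'the'): 0.3, ('maison', 'house'): 0.9}
e_snt = ['the', 'house']
f_snt = ['la', 'maison']
l = len(e_snt)

log_pf_e = 0.0
for f in f_snt:
    # P(f_j|e) = \sum_i P(a_j=i) P(f_j|e_i), with uniform P(a_j=i) = 1/l
    pfj_e = sum((1.0 / l) * t[f, e] for e in e_snt)
    log_pf_e += np.log(pfj_e)
print(log_pf_e)  # log P(f|e) = log(0.4) + log(0.6)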
Example 2
import numpy as np


def marginal_likelihood(e_corpus: Corpus, f_corpus: Corpus, model: JointModel):

    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters
    ll = 0.0
    for e_snt, f_snt in zip(e_corpus.itersentences(),
                            f_corpus.itersentences()):
        # observations
        l = e_snt.shape[0]
        m = f_snt.shape[0]
        log_pl = np.log(PL.generate(l))
        log_pm = np.log(PM.generate(m))
        # zeroth-order alignments (a_j independent of a_{j-1})
        # P(f,e) = \sum_z P(z) P(e|z) P(f|z,e)
        log_pfe = -np.inf  # contribution of this sentence
        for z in range(n_clusters):
            # contribution of the cluster
            log_pz = np.log(PZ.generate(z, l, m))
            # compute the contribution of the entire English sentence
            log_pe_z = 0.0
            # P(e|z) = \prod_i P(e_i|z)
            for i, e in enumerate(e_snt):
                log_pe_z += np.log(PEi.generate((i, e), z, l, m))

            # P(f|z,e) = \prod_j P(f_j|z,e)
            #          = \prod_j \sum_i P(f_j,a_j=i|z,e)
            log_pf_ze = 0.0
            for j, f in enumerate(f_snt):
                # P(f_j|z,e) = \sum_i P(f_j,a_j=i|z,e)
                pfj_ze = 0.0  # contribution of this French word
                for i, e in enumerate(e_snt):
                    pfj_ze += PAj.generate(
                        (j, i), e_snt, z, l, m) * PFj.generate(
                            (j, f), (j, i), e_snt, z, l, m)
                # P(f|z,e) = \prod_j P(f_j|z,e)
                log_pf_ze += np.log(pfj_ze)
            # \sum_z P(z) P(e|z) P(f|z,e)
            log_pfe = np.logaddexp(log_pfe, log_pz + log_pe_z + log_pf_ze)
        # accumulate the corpus log-likelihood: log P(l) + log P(m) + log P(f,e|l,m)
        ll += log_pl + log_pm + log_pfe
    # average negative log-likelihood per sentence
    return -ll / e_corpus.n_sentences()
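
The np.logaddexp fold above is a log-sum-exp over clusters computed without leaving log space. A quick sketch of the equivalence on made-up scores, assuming SciPy is available for the vectorised form:

import numpy as np
from scipy.special import logsumexp

# made-up per-cluster scores log P(z) + log P(e|z) + log P(f|z,e)
log_scores = np.array([-10.2, -9.7, -11.4])

# the fold used in the loop above: start at -inf (log of 0) and accumulate
log_pfe = -np.inf
for score in log_scores:
    log_pfe = np.logaddexp(log_pfe, score)

assert np.isclose(log_pfe, logsumexp(log_scores))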
Example 3
import numpy as np


def map_decoder(e_corpus: Corpus, f_corpus: Corpus, model: JointModel,
                callback):
    """

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: components
    :param callback: called for each sentence in the parallel corpus
        callable(s, z, a, p(z|f,e), p(a|z,f,e))
    """

    n_clusters = model.PZ.n_clusters
    # decode each sentence pair
    for s, (e_snt, f_snt) in enumerate(
            zip(e_corpus.itersentences(), f_corpus.itersentences())):

        log_pz_fe, log_post_a = log_posterior(e_snt, f_snt, model)

        # Here we get the best path for each cluster
        best_paths_z = log_post_a.argmax(2)  # shape: (n_clusters, m)

        # Now we find out which path is the best one across clusters
        best_z = 0
        best_log_prob = -np.inf
        for z in range(n_clusters):
            # p(z,a|f,e) = p(z|f,e) p(a|z,f,e)
            path_log_prob = log_pz_fe[z] + np.sum(
                [log_post_a[z, j, i] for j, i in enumerate(best_paths_z[z])])
            if path_log_prob > best_log_prob:  # update if better
                best_log_prob = path_log_prob
                best_z = z

        # best posterior probabilities p(a_j|z,f,e) along the selected path
        best_log_pa_zfe = np.array(
            [log_post_a[best_z, j, i] for j, i in enumerate(best_paths_z[best_z])])

        # communicate the finding
        callback(s, best_z, best_paths_z[best_z], np.exp(log_pz_fe[best_z]),
                 np.exp(best_log_pa_zfe))
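
The selection logic can be exercised on its own with random posteriors. A self-contained sketch, assuming (as above) that log p(a|z,f,e) has shape (n_clusters, m, l) and log p(z|f,e) has shape (n_clusters,):

import numpy as np

rng = np.random.default_rng(0)
n_clusters, m, l = 2, 3, 4
# hypothetical posteriors: each row over i sums to one before taking logs
log_post_a = np.log(rng.dirichlet(np.ones(l), size=(n_clusters, m)))
log_pz_fe = np.log(np.array([0.4, 0.6]))

best_paths_z = log_post_a.argmax(2)  # shape: (n_clusters, m)
# score of each cluster's best path: log p(z|f,e) + \sum_j log p(a_j|z,f,e)
path_scores = log_pz_fe + np.array(
    [log_post_a[z, np.arange(m), best_paths_z[z]].sum()
     for z in range(n_clusters)])
best_z = int(path_scores.argmax())
print(best_z, best_paths_z[best_z])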
Example 4
import numpy as np


def map_decoder(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel, callback):
    """

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: components
    :param callback: called for each sentence in the parallel corpus
        callable(s, z, a, p(z|f,e), p(a|z,f,e))
    """

    # decode each sentence pair
    for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):

        log_post_a = log_posterior(e_snt, f_snt, model)

        # best alignment link (English position) for each French position
        best_path = log_post_a.argmax(1)  # shape: (m,)

        # best posterior probabilities p(a_j|f,e) along the selected path
        best_posterior = np.array([log_post_a[j, i] for j, i in enumerate(best_path)])

        # communicate the finding
        callback(s, best_path, np.exp(best_posterior))
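
Any callable with the signature callback(s, a, p(a|f,e)) works here. For instance, a hypothetical callback that prints alignments in the usual English-French link format:

def print_alignment(s, a, pa):
    # a[j] is the English position generating f_j; pa[j] is its posterior
    links = ' '.join('{0}-{1}'.format(i, j) for j, i in enumerate(a))
    print('sentence {0}: {1}'.format(s, links))

# hypothetical usage, with e_corpus, f_corpus and model built elsewhere:
# map_decoder(e_corpus, f_corpus, model, print_alignment)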
Example 5
import logging


def EM(e_corpus: Corpus, f_corpus: Corpus, model: ConditionalModel, iterations=5):
    """
    Generative story:

        l ~ P(L)
        m ~ P(M)
        a_j ~ P(A_j | l, m) for j=1..m
        f_j ~ P(F_j | e_{a_j}) for j=1..m

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: a conditional model
    :param iterations: EM iterations
    """

    PL, PM, PAj, PFj = model.components

    logging.info('Iteration %d Likelihood %f', 0, marginal_likelihood(e_corpus, f_corpus, model))

    for iteration in range(1, iterations + 1):
        # E-step
        for s, (e_snt, f_snt) in enumerate(zip(e_corpus.itersentences(), f_corpus.itersentences())):
            # get the posterior P(a|f,e)
            post_a = posterior(e_snt, f_snt, model)
            l = e_snt.shape[0]
            m = f_snt.shape[0]

            # gather expected counts for (a_j=i) and (f_j, e_i): p(a_j=i|f,e)
            for j, f in enumerate(f_snt):
                for i, e in enumerate(e_snt):
                    PAj.observe((j, i), e_snt, 0, l, m, post_a[j, i])
                    PFj.observe((j, f), (j, i), e_snt, 0, l, m, post_a[j, i])

        # M-step
        model.update()

        logging.info('Iteration %d Likelihood %f', iteration, marginal_likelihood(e_corpus, f_corpus, model))
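
The `posterior` call in the E-step is defined elsewhere in the project. Under IBM1-style assumptions it reduces to a row-wise normalisation, as in this minimal sketch (the uniform alignment prior cancels out; t is a hypothetical lexical table keyed by (f, e)):

import numpy as np

def ibm1_posterior(e_snt, f_snt, t):
    # p(a_j=i|f,e) \propto P(a_j=i) P(f_j|e_i); uniform P(a_j=i) = 1/l cancels
    post_a = np.array([[t[f, e] for e in e_snt] for f in f_snt])  # shape: (m, l)
    post_a /= post_a.sum(axis=1, keepdims=True)  # normalise over i for each j
    return post_a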
Example 6
import logging


def EM(e_corpus: Corpus, f_corpus: Corpus, model: JointModel, iterations=5):
    """
    Generative story:

        l ~ P(L)
        m ~ P(M)
        z ~ P(Z)
        e_i ~ P(E_i | z) for i=1..l
        a_j ~ P(A_j | l, m) for j=1..m
        f_j ~ P(F_j | e_{a_j}, z) for j=1..m

    Joint distribution:
        P(F,E,A,Z,L,M) = P(L)P(M)P(Z)P(E|Z)P(A|L,M)P(F|E,A,Z,L,M)

    We make the following independence assumptions:

        P(e|z) = \prod_i P(e_i|z)
        P(f,a|e,z,l,m) = \prod_j P(a_j|l,m) P(f_j|e_{a_j},z)

    The EM algorithm depends on 2 posterior computations:

        [1] P(z|f,e,l,m) = P(z)P(e|z)P(f|e,z)/P(f,e)
        where
            P(e|z) = \prod_i P(e_i|z)
            P(f|e,z) = \sum_a P(f,a|e,z) = \prod_j \sum_i P(a_j=i)P(f_j|e_i,z)
            P(f,e) = \sum_z \sum_a P(f,e,z,a|l,m)
                = \sum_z P(z)P(e|z) P(f|e,z)
                = \sum_z P(z)P(e|z) \prod_j \sum_i P(a_j=i) P(f_j|e_i,z)
        and
        [2] P(a|f,e,z) = P(a,z,f,e,l,m) / P(f,e,z,l,m)
            =    P(z)P(e|z)P(a|l,m)P(f|e,a,z)
              --------------------------------------
              \sum_a P(z)P(e|z)P(a|l,m)P(f|e,a,z)
            =    P(z)P(e|z)P(a|l,m)P(f|e,a,z)
              --------------------------------------
              P(z)P(e|z)\sum_a P(a|l,m)P(f|e,a,z)
            =   P(z)P(e|z)\prod_j P(a_j|l,m)P(f_j|e_{a_j},z)
              -------------------------------------------------
              P(z)P(e|z)\prod_j\sum_i P(a_j=i|l,m)P(f_j|e_i,z)
            = \prod_j     P(a_j|l,m)P(f_j|e_{a_j},z)
                       -------------------------------
                       \sum_i P(a_j=i|l,m)P(f_j|e_i,z)
            = \prod_j P(a_j|f,e,z)
        where
            P(a_j|f,e,z) =    P(a_j|l,m)P(f_j|e_{a_j},z)
                           -------------------------------
                           \sum_i P(a_j=i|l,m)P(f_j|e_i,z)

    Note that the choice of parameterisation is independent of the EM algorithm in this method.
    For example,
        P(a_j|l,m) can be
            * uniform (IBM1)
            * categorical (IBM2)
        P(f_j|e_{a_j}, z) can be
            * categorical and independent of z, i.e. P(f_j|e_{a_j}, z) = P(f_j|e_{a_j})
            * categorical
            * PoE: P(f_j|e_{a_j}, z) \propto P(f_j|e_{a_j}) P(f_j|z)
            * all of the above using MLP or LR instead of categorical distributions
        we can also have P(a_j|l,m)P(f_j|e_{a_j}, z) modelled by a single LR (with MLP-induced features).

    :param e_corpus: English data
    :param f_corpus: French data
    :param model: all components
    :param iterations: EM iterations
    """

    PL, PM, PZ, PEi, PAj, PFj = model.components
    n_clusters = PZ.n_clusters

    logging.info('Iteration %d Likelihood %f', 0,
                 marginal_likelihood(e_corpus, f_corpus, model))

    for iteration in range(1, iterations + 1):
        # E-step
        for s, (e_snt, f_snt) in enumerate(
                zip(e_corpus.itersentences(), f_corpus.itersentences())):
            # get the factorised posterior: P(z|f,e) and P(a|z,f,e)
            post_z, post_a = posterior(e_snt, f_snt, model)
            l = e_snt.shape[0]
            m = f_snt.shape[0]
            for z in range(n_clusters):
                # gather expected count for z: p(z|f, e)
                PZ.observe(z, l, m, post_z[z])
                # gather expected counts for (z, e_i): p(z|f, e)
                for i, e in enumerate(e_snt):
                    PEi.observe((i, e), z, l, m, post_z[z])
                # gather expected counts for (a_j=i) and (f_j, e_i): p(a_j=i|f,e,z)
                for j, f in enumerate(f_snt):
                    for i, e in enumerate(e_snt):
                        PAj.observe((j, i), e_snt, z, l, m, post_a[z, j, i])
                        PFj.observe((j, f), (j, i), e_snt, z, l, m,
                                    post_a[z, j, i])

        # M-step
        model.update()

        logging.info('Iteration %d Likelihood %f', iteration,
                     marginal_likelihood(e_corpus, f_corpus, model))
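
The factorised posterior used in this E-step follows [1] and [2] from the docstring. A toy numpy sketch, assuming the per-link terms P(a_j=i|l,m) P(f_j|e_i,z) have already been tabulated (an illustration, not the project's `posterior`):

import numpy as np

def joint_posterior_sketch(pz, pe_z, link_terms):
    # pz: shape (n_clusters,), prior P(z)
    # pe_z: shape (n_clusters,), precomputed P(e|z) = \prod_i P(e_i|z)
    # link_terms: shape (n_clusters, m, l), entries P(a_j=i|l,m) P(f_j|e_i,z)

    # [2] P(a_j=i|f,e,z): normalise each row over i
    post_a = link_terms / link_terms.sum(axis=2, keepdims=True)
    # P(f|e,z) = \prod_j \sum_i P(a_j=i|l,m) P(f_j|e_i,z)
    pf_ze = link_terms.sum(axis=2).prod(axis=1)
    # [1] P(z|f,e) \propto P(z) P(e|z) P(f|e,z)
    post_z = pz * pe_z * pf_ze
    return post_z / post_z.sum(), post_a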