Esempio n. 1
0
    def fit(self):
        """
        Fit the GMM from the gold targets and optionally refine it with EM.

        A ``self.k`` x ``self.n`` one-hot table is built from
        ``self.targets`` and handed to ``gmm_update`` together with
        ``self.features``. When ``self.use_em`` is truthy, training continues
        with ``em_decipher``. Finally, if both language-model tables are
        available, ``self.ll`` and ``self.xe`` are (re)computed from
        ``em_forward_backward``.
        """
        # One-hot assignment table: row = gold label, column = observation.
        gold_tbl = np.zeros((self.k, self.n))
        for col, label in enumerate(self.targets):
            gold_tbl[label, col] = 1

        eprint('initialize use gold')
        self.gmm, self.weights = gmm_update(
            self.features, gold_tbl, cov_type='fix', scaling_fix_cov=0.1)
        self.link_tbl = gmm_assign(self.gmm, self.features)

        if self.use_em:
            eprint('continue training with LM-GMM')
            (self.link_tbl, self.gmm, self.weights,
             self.xe, self.ll) = em_decipher(
                 self.features, self.unigram_tbl, self.bigram_tbl,
                 self.link_tbl)

        # Without a complete language model there is nothing left to score.
        if self.unigram_tbl is None or self.bigram_tbl is None:
            return

        _, _, log_prb = em_forward_backward(
            self.features, self.unigram_tbl, self.bigram_tbl, self.link_tbl)
        eprint('log likelihood of LM-GMM is {}'.format(log_prb))
        self.ll = log_prb
        self.xe = cross_entropy([log_prb], [self.n])
Esempio n. 2
0
def em_restart(line,
               unigram_tbl,
               bigram_tbl,
               weighted_tbl_init_function,
               subst_tbl_init_function,
               restart=10,
               use_alternative_update=False):
    """
    Run EM several times from fresh initializations and keep the best run.

    :param line: observations, forwarded to ``gmm_update``, ``gmm_assign``
      and the decipher routine.
    :param unigram_tbl: unigram table forwarded to the decipher routine.
    :param bigram_tbl: bigram table forwarded to the decipher routine.
    :param weighted_tbl_init_function: A function generating the weighted
      table used to initialize the Gaussian distribution parameters. Let
      k=#clusters, n=#observations, then the table should have k * n dims.
      Each cell may hold any positive number. The ratios between values
      indicate the importance of each feature for composing clusters.
    :param subst_tbl_init_function: A function generating the initial
      substitution table passed to the decipher routine.
    :param restart: number of restarts; the loop runs ``restart + 1`` times.
    :param use_alternative_update: when true, ``em_decipher_alternative`` is
      used instead of ``em_decipher``.
    :return: tuple ``(link_tbl, subst_tbl, gmm, weights, xe, ll)`` of the run
      with the lowest cross entropy. The entries keep their initial values
      (``None``, and ``np.inf`` for the cross entropy) if no run improved on
      them.
    """
    decipher = (em_decipher_alternative
                if use_alternative_update else em_decipher)

    # Best run so far, laid out as (link_tbl, subst_tbl, gmm, weights, xe, ll).
    winner = (None, None, None, None, np.inf, None)

    eprint('start training...')
    for attempt in range(restart + 1):
        eprint('init parameters')
        start_tbl = weighted_tbl_init_function()
        gmm, weights = gmm_update(line, start_tbl, cov_type='fix')
        link_tbl = gmm_assign(gmm, line)
        if attempt > 0:
            eprint('random restart --- {} restarts remaining, '
                   'best cross entropy so far is {}'.format(
                       restart - attempt, winner[4]))
        subst_start_tbl = subst_tbl_init_function()

        link_tbl, subst_tbl, gmm, weights, xe, ll = decipher(
            line, unigram_tbl, bigram_tbl, link_tbl, subst_start_tbl)

        # NaN results are skipped; otherwise keep strictly better runs only.
        if not np.isnan(xe) and xe < winner[4]:
            winner = (link_tbl, subst_tbl, gmm, weights, xe, ll)

    eprint('with {} restarts, '
           'the best cross entropy is {}, '
           'the best log likelihood is {}'.format(
               restart, winner[4], winner[5]))
    return winner
Esempio n. 3
0
def em_iter_update(cnt_tbl, line):
    """
    Update the GMM parameters and the link table from a count table.

    :param cnt_tbl: 2-D table treated as log-domain values; it is normalized
      per column before being exponentiated.
    :param line: observations passed to ``gmm_update`` and ``gmm_assign``.
    :return: tuple ``(link_tbl, gmm, weights)``.
    """
    # Only the ratio between pdfs matters, so every column is normalized
    # in log space before leaving the log domain.
    col_log_total = logsumexp(cnt_tbl, axis=0)
    weighted_tbl = np.exp(cnt_tbl - col_log_total[np.newaxis, :])
    gmm, weights = gmm_update(line, weighted_tbl, cov_type='fix')
    return gmm_assign(gmm, line), gmm, weights
Esempio n. 4
0
def em_iter_update(cnt_tbl, line):
    """
    Re-estimate the GMM and the link table from a table of counts.

    :param cnt_tbl: 2-D table treated as log-domain values.
    :param line: observations passed to ``gmm_update`` and ``gmm_assign``.
    :return: tuple ``(link_tbl, gmm, weights)``.
    """
    # Column-wise normalization in log space, then back to linear scale.
    log_norm = logsumexp(cnt_tbl, axis=0)[np.newaxis, :]
    weighted_tbl = np.exp(cnt_tbl - log_norm)

    update_options = dict(cov_type='fix', scaling_fix_cov=0.1)
    gmm, weights = gmm_update(line, weighted_tbl, **update_options)

    link_tbl = gmm_assign(gmm, line)
    return link_tbl, gmm, weights
Esempio n. 5
0
def em_iter_update(cnt_tbl, line):
    """
    Turn a count table into new GMM parameters and a new link table.

    :param cnt_tbl: 2-D table treated as log-domain values.
    :param line: observations passed to ``gmm_update`` and ``gmm_assign``.
    :return: tuple ``(link_tbl, gmm, weights)``.
    """
    # Column normalization is used because only the ratio between pdfs is
    # useful for the update.
    weighted_tbl = np.exp(
        cnt_tbl - logsumexp(cnt_tbl, axis=0)[np.newaxis, :])
    gmm, weights = gmm_update(
        line, weighted_tbl, cov_type='fix', scaling_fix_cov=0.1)
    return gmm_assign(gmm, line), gmm, weights
Esempio n. 6
0
def em_iter_update(cnt_tbl, line, cov_type='fix', scaling_factor=0.1):
    """
    Update the GMM parameters and the link table from a count table.

    :param cnt_tbl: 2-D table treated as log-domain values; it is normalized
      per column before being exponentiated.
    :param line: observations passed to ``gmm_update`` and ``gmm_assign``.
    :param cov_type: covariance type forwarded to ``gmm_update``.
    :param scaling_factor: forwarded to ``gmm_update`` as
      ``scaling_fix_cov``.
    :return: tuple ``(link_tbl, gmm, weights)``.
    """
    # TODO: verify this step against the GMM implementation in scikit-learn,
    # TODO: which should apply column normalization. Skipping the
    # TODO: normalization (i.e. using np.exp(cnt_tbl) directly) should be
    # TODO: right as well.
    col_log_total = logsumexp(cnt_tbl, axis=0)
    log_weighted_tbl = cnt_tbl - col_log_total[np.newaxis, :]
    weighted_tbl = np.exp(log_weighted_tbl)

    gmm, weights = gmm_update(
        line, weighted_tbl, cov_type=cov_type, scaling_fix_cov=scaling_factor)
    return gmm_assign(gmm, line), gmm, weights
Esempio n. 7
0
def em_gmm_restart(line,
                   link_tbl_init_function,
                   restart=10,
                   cov_type='fix',
                   scaling_factor=0.1):
    """
    Run GMM-only EM from several initializations and keep the best run.

    :param line: observations, forwarded to ``gmm_update``, ``gmm_assign``
      and ``em_gmm``.
    :param link_tbl_init_function: A function generating the table used to
      initialize the GMM through ``gmm_update``.
    :param restart: number of restarts; the loop runs ``restart + 1`` times.
    :param cov_type: covariance type forwarded to ``gmm_update`` and
      ``em_gmm``.
    :param scaling_factor: scaling forwarded to ``gmm_update`` (as
      ``scaling_fix_cov``) and to ``em_gmm``.
    :return: tuple ``(link_tbl, gmm, weights, xe)`` of the run with the
      lowest cross entropy. The entries keep their initial values (``None``,
      and ``np.inf`` for the cross entropy) if no run improved on them.
    """
    # Best run so far, laid out as (link_tbl, gmm, weights, xe).
    winner = (None, None, None, np.inf)

    eprint('start training...')
    for attempt in range(restart + 1):
        eprint('init parameters')
        gmm, weights = gmm_update(
            line,
            link_tbl_init_function(),
            cov_type=cov_type,
            scaling_fix_cov=scaling_factor)
        link_tbl = gmm_assign(gmm, line)
        if attempt > 0:
            eprint('random restart --- {} restarts remaining, '
                   'best cross entropy so far is {}'.format(
                       restart - attempt, winner[3]))

        link_tbl, gmm, weights, xe = em_gmm(
            line,
            link_tbl,
            weights,
            cov_type=cov_type,
            scaling_factor=scaling_factor)

        # Only a strictly lower cross entropy replaces the current best.
        if xe < winner[3]:
            winner = (link_tbl, gmm, weights, xe)

    eprint('with {} restarts, '
           'the best cross entropy is {}'.format(restart, winner[3]))
    return winner
Esempio n. 8
0
def estimate_gmm(feature_bins,
                 feature_zs,
                 col_sizes,
                 k,
                 ignore=-1,
                 cov_type='spherical',
                 scaling_factor=1.0):
    """
    Estimate a GMM from per-row features and their hard cluster assignments.

    For every row except ``ignore``, the first ``col_sizes[row]`` entries of
    ``feature_bins[row]`` are collected, and a ``k`` x ``col_sizes[row]``
    one-hot table is built from the matching entries of ``feature_zs``. The
    tables are concatenated along the column axis and passed to
    ``gmm_update``.

    :param feature_bins: 2-D indexable table of features, one row per bin.
    :param feature_zs: 2-D indexable table of assignments; its entries are
      used as row indices into the ``k``-row one-hot table.
    :param col_sizes: number of valid leading entries for each row.
    :param k: number of rows of every one-hot table.
    :param ignore: index of a row to leave out; the default ``-1`` never
      equals a row index, so every row is used.
    :param cov_type: covariance type forwarded to ``gmm_update``.
    :param scaling_factor: forwarded to ``gmm_update`` as
      ``scaling_fix_cov``.
    :return: tuple ``(gmm, log_weights)`` where ``log_weights`` is
      ``np.log`` of the weights returned by ``gmm_update``.
    """
    collected = []
    one_hot_blocks = []
    for row in range(len(feature_bins)):
        if row == ignore:
            continue
        width = col_sizes[row]
        block = np.zeros((k, width))
        collected.extend(feature_bins[row, :width])
        # Mark, for each column, the cluster that observation is assigned to.
        block[feature_zs[row, :width], range(width)] = 1.0
        one_hot_blocks.append(block)

    gmm, weights = gmm_update(
        np.asarray(collected),
        np.concatenate(one_hot_blocks, axis=1),
        cov_type=cov_type,
        scaling_fix_cov=scaling_factor)
    return gmm, np.log(weights)