Example #1
def trw(node_weights, edges, edge_weights, y,
        max_iter=100, verbose=0, tol=1e-3,
        relaxed=False):
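    # Dual decomposition with a single dual variable mu shared by two
    # subproblems: move-making graph-cut inference (inference_gco) and the
    # weak-label oracle optimize_kappa; mu follows subgradient steps with a
    # 1/sqrt(t) step size. Assumes `import numpy as np` and the project
    # helpers decompose_grid_graph, inference_gco and optimize_kappa.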

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))

    learning_rate = 0.1
    energy_history = []

    for iteration in xrange(max_iter):
        E = 0
        dmu = np.zeros((n_nodes, n_states))
        unaries = node_weights - mu

        y_hat_gco, energy = inference_gco(unaries, edge_weights, edges,
                                          n_iter=5, return_energy=True)
        E -= energy

        y_hat_kappa, energy = optimize_kappa(y, mu, 1, n_nodes, n_states)
        E += energy

        dmu[np.ogrid[:dmu.shape[0]], y_hat_gco] -= 1
        dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

        mu -= learning_rate * dmu

        energy_history.append(E)

        if iteration:
            learning_rate = 1. / np.sqrt(iteration)

        if verbose:
            print 'Iteration {}: energy {}'.format(iteration, E)

        if iteration and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print 'Converged'
            break

    return y_hat_gco, y_hat_kappa, energy_history, iteration
Example #2
def trw_lbfgs(node_weights, edges, edge_weights,
              max_iter=100, verbose=1, tol=1e-3):
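    # Optimizes the chain-decomposition dual over the lambdas with L-BFGS
    # instead of subgradient steps; the objective/gradient callback `f` is
    # defined elsewhere in the project. Assumes `import numpy as np` and
    # `from scipy.optimize import fmin_l_bfgs_b`.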

    result = decompose_grid_graph([(node_weights, edges, edge_weights)], get_sign=True)
    contains_node, chains, edge_index, sign = result[0][0], result[1][0], result[2][0], result[3][0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = np.zeros((n_nodes, n_states))
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
        assert len(contains_node[p]) == 2
    for chain in chains:
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    history = []
    x, f_val, d = fmin_l_bfgs_b(f, np.zeros((n_nodes, n_states)),
                                args=(node_weights, multiplier, chains, edge_weights, edge_index, sign, y_hat, contains_node, history),
                                maxiter=max_iter,
                                disp=verbose,
                                pgtol=tol)

    lambdas = x.reshape((n_nodes, n_states))
    unaries = node_weights * multiplier
    for i, chain in enumerate(chains):
        y_hat[i], e = optimize_chain(chain,
                                     sign[i] * lambdas[chain,:] + unaries[chain,:],
                                     edge_weights,
                                     edge_index)


    lambda_sum = np.zeros((n_nodes, n_states), dtype=np.float64)
    for p in xrange(n_nodes):
        for i in contains_node[p]:
            pos = np.where(chains[i] == p)[0][0]
            lambda_sum[p, y_hat[i][pos]] += multiplier[p]

    info = {}
    info['x'] = x
    info['f'] = f_val
    info['d'] = d
    info['history'] = history

    return lambda_sum, info
Example #3
def trw(node_weights, edges, edge_weights, y,
        max_iter=100, verbose=0, tol=1e-3):
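    # Chain-based dual decomposition for weakly labeled data: each chain is
    # solved by optimize_chain and the weak-label oracle by optimize_kappa,
    # while the dual variables (lambdas, mu) are updated so that they stay
    # zero-sum at every node (verified by the assertion below).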

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / (len(contains_node[p]) + 1))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))

    learning_rate = 0.1
    energy_history = []

    for iteration in xrange(max_iter):
        E = 0
        unaries = node_weights.copy()
        for label in xrange(n_states):
            if label not in y.weak:
                unaries[:,label] += y.weights
        unaries *= multiplier

        for i, chain in enumerate(chains):
            y_hat[i], energy = optimize_chain(chain,
                                              lambdas[i] + unaries[chain,:],
                                              edge_weights,
                                              edge_index)

            E += energy

        y_hat_kappa, energy = optimize_kappa(y, mu + unaries, 1, n_nodes, n_states, augment=False)
        E += energy

        lambda_sum = np.zeros((n_nodes, n_states), dtype=np.float64)
        for p in xrange(n_nodes):
            assert len(contains_node[p]) == 2
            for i in contains_node[p]:
                pos = np.where(chains[i] == p)[0][0]
                lambda_sum[p, y_hat[i][pos]] += multiplier[p]

        lambda_sum[np.ogrid[:n_nodes], y_hat_kappa] += multiplier.flatten()

        for i in xrange(len(chains)):
            N = lambdas[i].shape[0]

            lambdas[i][np.ogrid[:N], y_hat[i]] -= learning_rate
            lambdas[i] += learning_rate * lambda_sum[chains[i],:]

        mu[np.ogrid[:n_nodes], y_hat_kappa] -= learning_rate
        mu += learning_rate * lambda_sum

        test_l = np.zeros((n_nodes, n_states))
        for p in xrange(n_nodes):
            for i in contains_node[p]:
                pos = np.where(chains[i] == p)[0][0]
                test_l[p, :] += lambdas[i][pos,:]
        test_l += mu

        # the chain duals and mu must stay zero-sum at every node
        assert np.abs(np.sum(test_l)) < 1e-10

        energy_history.append(E)

        if iteration:
            learning_rate = 1. / np.sqrt(iteration)

        if verbose:
            print 'Iteration {}: energy {}'.format(iteration, E)

        if iteration > 300 and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print 'Converged'
            break

    return lambda_sum, y_hat_kappa, energy_history, iteration
Example #4
def trw(node_weights, edges, edge_weights, y,
        max_iter=100, verbose=0, tol=1e-3,
        update_mu=50, get_energy=None):
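    # Two-level scheme: an inner loop makes up to `update_mu` passes of
    # chain-dual (lambda) updates with mu held fixed, then one subgradient
    # step is taken in mu against the weak-label oracle optimize_kappa.
    # When `get_energy` is supplied, a primal energy is also recorded from
    # the current labelling.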

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))

    learning_rate = 0.1
    energy_history = []
    primal_history = []

    for iteration in xrange(max_iter):
        dmu = np.zeros((n_nodes, n_states))
        unaries = (node_weights - mu) * multiplier

        inner_energy = []
        for inner in xrange(update_mu):
            E = 0
            for i, chain in enumerate(chains):
                y_hat[i], energy = optimize_chain(chain,
                                                  lambdas[i] + unaries[chain,:],
                                                  edge_weights,
                                                  edge_index)

                E += energy

            inner_energy.append(E)

            lambda_sum = np.zeros((n_nodes, n_states), dtype=np.float64)
            for p in xrange(n_nodes):
                for i in contains_node[p]:
                    pos = np.where(chains[i] == p)[0][0]
                    lambda_sum[p, y_hat[i][pos]] += multiplier[p]

            for i in xrange(len(chains)):
                N = lambdas[i].shape[0]

                lambdas[i][np.ogrid[:N], y_hat[i]] -= learning_rate
                lambdas[i] += learning_rate * lambda_sum[chains[i],:]

            if inner > 0 and np.abs(inner_energy[-2] - E) < 1e-2:
                break

        E = inner_energy[-1]

        y_hat_kappa, energy = optimize_kappa(y, mu, 1, n_nodes, n_states)
        E += energy

        for i in xrange(len(chains)):
            dmu[chains[i], y_hat[i]] -= multiplier[chains[i]].flatten()
        dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

        mu -= learning_rate * dmu

        energy_history.append(E)

        if get_energy is not None:
            primal = get_energy(get_labelling(lambda_sum))
            primal_history.append(primal)

        if iteration:
            learning_rate = 1. / np.sqrt(iteration)

        if verbose:
            print 'Iteration {}: inner={} energy={}'.format(iteration, inner, E)

        if iteration > 0 and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print 'Converged'
            break

    info = {'primal': primal_history,
            'dual': energy_history,
            'iteration': iteration}

    return lambda_sum, y_hat_kappa, info
Example #5
    def fit(self,
            X,
            Y,
            train_scorer,
            test_scorer,
            decompose='general',
            use_latent_first_iter=500,
            undergenerating_weak=True,
            smd=False):
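        # Subgradient training of w on the dual-decomposition bound.
        # Fully labeled objects use loss-augmented chain subproblems; weakly
        # labeled objects enter after `use_latent_first_iter` iterations and
        # are handled by one of three schemes selected via
        # `undergenerating_weak` and `smd`: graph cuts + kappa oracle,
        # chain duals + kappa oracle, or a per-label binary relaxation.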
        self.logger.info('Initialization')

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index = decompose_grid_graph(X)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        xx = []
        mu = {}
        for k in xrange(len(X)):
            x, y = X[k], Y[k]
            n_nodes = x[0].shape[0]
            xx.append(np.zeros(n_nodes))
            _lambdas = []
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _lambdas.append(np.zeros((len(chain), self.n_states)))
                _y_hat.append(np.zeros(len(chain), dtype=np.int32))
            lambdas.append(_lambdas)
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)
            if not y.full_labeled:
                mu[k] = np.zeros((n_nodes, self.n_states))

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []
        self.train_score = []
        self.test_score = []
        self.w_history = []

        learning_rate1 = 0.1
        learning_rate2 = 0.1

        for iteration in xrange(self.max_iter):
            self.logger.info('Iteration %d', iteration)
            self.logger.info('Optimize slave MRF and update w')

            objective = 0
            dw = np.zeros(w.shape)

            for k in xrange(len(X)):
                x, y = X[k], Y[k]
                n_nodes = x[0].shape[0]

                # self.logger.info('object %d', k)

                if y.full_labeled:
                    unaries = self._loss_augment_unaries(
                        self._get_unary_potentials(x, w), y.full, y.weights)
                    unaries *= multiplier[k]
                    pairwise = self._get_pairwise_potentials(x, w)

                    jf = self._joint_features_full(x, y.full)
                    objective -= np.dot(w, jf)
                    dw -= jf

                    for i in xrange(len(chains[k])):
                        y_hat[k][i], energy = optimize_chain(
                            chains[k][i],
                            lambdas[k][i] + unaries[chains[k][i], :], pairwise,
                            edge_index[k])

                        dw += self._joint_features(chains[k][i], x,
                                                   y_hat[k][i], edge_index[k],
                                                   multiplier[k])

                        objective += energy
                elif iteration > use_latent_first_iter:
                    if undergenerating_weak:
                        # Use gco for full K oracle
                        # y_hat_, energy = self.loss_augmented_inference(x, y, w)
                        # jf_gt = self._joint_features_full(x, y.full)
                        # objective -= np.dot(w, jf_gt)
                        # objective += energy
                        # dw -= jf_gt
                        # dw += self._joint_features_full(x, y_hat_)

                        # use gco for first summand in DD
                        for mm in xrange(10):
                            dmu = np.zeros((n_nodes, self.n_states))

                            unaries = self._get_unary_potentials(x, w) - mu[k]
                            pairwise = self._get_pairwise_potentials(x, w)

                            y_hat_gco, energy = inference_gco(
                                unaries,
                                pairwise,
                                self._get_edges(x),
                                n_iter=5,
                                return_energy=True)
                            objective -= energy
                            dmu[np.ogrid[:dmu.shape[0]], y_hat_gco] -= 1
                            dw += self._joint_features_full(x, y_hat_gco)

                            jf = self._joint_features_full(x, y.full)
                            objective -= np.dot(w, jf)
                            dw -= jf

                            y_hat_kappa, energy = optimize_kappa(
                                y, mu[k], self.alpha, n_nodes, self.n_states)
                            objective += energy
                            dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

                            mu[k] -= learning_rate2 * dmu
                    elif not smd:
                        dmu = np.zeros((n_nodes, self.n_states))

                        unaries = (self._get_unary_potentials(x, w) -
                                   mu[k]) * multiplier[k]
                        pairwise = self._get_pairwise_potentials(x, w)

                        jf = self._joint_features_full(x, y.full)
                        objective -= np.dot(w, jf)
                        dw -= jf

                        # begin inner loop (removable to restore the previous single-pass behaviour)
                        E = 0
                        Eprev = -100
                        for j in xrange(self.update_mu):
                            E = 0
                            for i in xrange(len(chains[k])):
                                y_hat[k][i], energy = optimize_chain(
                                    chains[k][i],
                                    lambdas[k][i] + unaries[chains[k][i], :],
                                    pairwise, edge_index[k])
                                E += energy

                            lambda_sum = np.zeros((n_nodes, self.n_states),
                                                  dtype=np.float64)

                            for p in xrange(n_nodes):
                                for i in contains_node[k][p]:
                                    pos = np.where(chains[k][i] == p)[0][0]
                                    lambda_sum[
                                        p,
                                        y_hat[k][i][pos]] += multiplier[k][p]

                            for i in xrange(len(chains[k])):
                                N = lambdas[k][i].shape[0]

                                lambdas[k][i][np.ogrid[:N],
                                              y_hat[k][i]] -= learning_rate2
                                lambdas[k][i] += learning_rate2 * lambda_sum[
                                    chains[k][i], :]

                            if np.abs(E - Eprev) < 0.1:
                                break
                            Eprev = E
                        # end inner loop

                        # final pass over the chains with the updated lambdas
                        for i in xrange(len(chains[k])):
                            y_hat[k][i], energy = optimize_chain(
                                chains[k][i],
                                lambdas[k][i] + unaries[chains[k][i], :],
                                pairwise, edge_index[k])

                            dw += self._joint_features(chains[k][i], x,
                                                       y_hat[k][i],
                                                       edge_index[k],
                                                       multiplier[k])

                            objective += energy

                            dmu[chains[k][i], y_hat[k][i]] -= multiplier[k][
                                chains[k][i]].flatten()

                        y_hat_kappa, energy = optimize_kappa(
                            y, mu[k], self.alpha, n_nodes, self.n_states)

                        objective += energy
                        dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

                        mu[k] -= learning_rate2 * dmu
                    elif smd:
                        if iteration > 1500:
                            mMu = 10
                        else:
                            mMu = 1
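                        # smd branch: the multi-label problem is split into
                        # per-label binary subproblems (binary_general_graph)
                        # coupled through the shared node potential xx[k],
                        # which is refitted with L-BFGS before each pass.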
                        for mm in xrange(mMu):
                            dmu = np.zeros((n_nodes, self.n_states))

                            jf = self._joint_features_full(x, y.full)
                            objective -= np.dot(w, jf)
                            dw -= jf

                            unaries = -self._get_unary_potentials(x, w) + mu[k]
                            edge_weights = -self._get_pairwise_potentials(x, w)
                            edges = self._get_edges(x)

                            n_edges = edges.shape[0]
                            y_hat2 = []
                            pairwise = []
                            for j in xrange(self.n_states):
                                y_hat2.append(np.zeros(self.n_states))
                                _pairwise = np.zeros((n_edges, 2, 2))
                                for i in xrange(n_edges):
                                    _pairwise[i, 1, 0] = _pairwise[
                                        i, 0, 1] = -0.5 * edge_weights[i, j, j]
                                pairwise.append(_pairwise)

                            for i in xrange(n_edges):
                                e1, e2 = edges[i]
                                unaries[e1, :] += 0.5 * np.diag(
                                    edge_weights[i, :, :])
                                unaries[e2, :] += 0.5 * np.diag(
                                    edge_weights[i, :, :])

                            xx[k], f_val, d = fmin_l_bfgs_b(f,
                                                            xx[k],
                                                            args=(unaries,
                                                                  pairwise,
                                                                  edges),
                                                            maxiter=50,
                                                            maxfun=50,
                                                            pgtol=1e-2)

                            E = np.sum(xx[k])
                            for j in xrange(self.n_states):
                                new_unaries = np.zeros((n_nodes, 2))
                                new_unaries[:, 1] = unaries[:, j] + xx[k]
                                y_hat2[j], energy = binary_general_graph(
                                    edges, new_unaries, pairwise[j])
                                E -= 0.5 * energy
                                dmu[:, j] -= y_hat2[j]

                                dw += self._joint_features_full(
                                    x, y_hat2[j] * j)

                            y_hat_kappa, energy = optimize_kappa(
                                y, mu[k], 1, n_nodes, self.n_states)
                            E += energy
                            dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1
                            objective += E

                            mu[k] -= learning_rate2 * dmu

            dw += w / self.C

            if iteration < 100 or iteration % self.update_w_every == 0:
                w -= learning_rate1 * dw
            objective = self.C * objective + np.sum(w**2) / 2

            self.logger.info('Update lambda')

            for k in xrange(len(X)):
                if undergenerating_weak and not Y[k].full_labeled:
                    continue
                if smd and not Y[k].full_labeled:
                    continue

                n_nodes = X[k][0].shape[0]
                lambda_sum = np.zeros((n_nodes, self.n_states),
                                      dtype=np.float64)

                for p in xrange(n_nodes):
                    for i in contains_node[k][p]:
                        pos = np.where(chains[k][i] == p)[0][0]
                        lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]

                for i in xrange(len(chains[k])):
                    N = lambdas[k][i].shape[0]

                    lambdas[k][i][np.ogrid[:N], y_hat[k][i]] -= learning_rate2
                    lambdas[k][i] += learning_rate2 * lambda_sum[
                        chains[k][i], :]

            if iteration % self.complete_every == 0 or iteration in [
                    51, 80, 101, 130
            ]:
                self.logger.info('Complete latent variables')
                Y_new = Parallel(n_jobs=self.n_jobs, verbose=0,
                                 max_nbytes=1e8)(
                                     delayed(latent)(self.model, x, y, w)
                                     for x, y in zip(X, Y))
                changes = np.sum([
                    np.any(y_new.full != y.full) for y_new, y in zip(Y_new, Y)
                ])
                self.logger.info('changes in latent variables: %d', changes)
                Y = Y_new

            if iteration and (iteration % self.check_every == 0):
                self.logger.info('Compute train and test scores')
                self.train_score.append(train_scorer(w))
                self.logger.info('Train SCORE: %f', self.train_score[-1])
                self.test_score.append(test_scorer(w))
                self.logger.info('Test SCORE: %f', self.test_score[-1])

            self.logger.info('diff: %f', np.sum((w - self.w)**2))
            if iteration:
                learning_rate1 = 1.0 / iteration
                learning_rate2 = 1.0 / iteration

            self.timestamps.append(time.time() - self.start_time)
            self.objective_curve.append(objective)

            self.logger.info('Objective: %f', objective)

            self.w = w.copy()
            self.w_history.append(self.w)

        self.w = w

        self.timestamps = np.array(self.timestamps)
        self.objective_curve = np.array(self.objective_curve)
        self.train_score = np.array(self.train_score)
        self.test_score = np.array(self.test_score)
        self.w_history = np.vstack(self.w_history)
Example #6
def trw(node_weights, edges, edge_weights,
        max_iter=100, verbose=0, tol=1e-3,
        strategy='sqrt',
        r0=1.5, r1=0.5, gamma=0.1):

    assert strategy in ['best-dual', 'best-primal', 'sqrt', 'linear']
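    # TRW dual ascent over the chain duals with a selectable step-size rule:
    # 'sqrt' (1/sqrt(t)), 'linear' (1/t), or Polyak-style steps that estimate
    # the optimum either from the best dual seen so far ('best-dual', with an
    # adaptively scaled gap delta) or from the best primal ('best-primal').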

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    delta = 1.
    learning_rate = 0.1
    dual_history = []
    primal_history = []

    best_dual = np.inf
    best_primal = -np.inf

    for iteration in xrange(max_iter):
        dual = 0.0
        unaries = node_weights * multiplier

        for i, chain in enumerate(chains):
            y_hat[i], e = optimize_chain(chain,
                                         lambdas[i] + unaries[chain,:],
                                         edge_weights,
                                         edge_index)

            dual += e

        lambda_sum = np.zeros((n_nodes, n_states), dtype=np.float64)
        for p in xrange(n_nodes):
            for i in contains_node[p]:
                pos = np.where(chains[i] == p)[0][0]
                lambda_sum[p, y_hat[i][pos]] += multiplier[p]

        p_norm = 0.0
        for i in xrange(len(chains)):
            N = lambdas[i].shape[0]

            dlambda = lambda_sum[chains[i],:].copy()
            dlambda[np.ogrid[:N], y_hat[i]] -= 1

            p_norm += np.sum(dlambda ** 2)

            lambdas[i] += learning_rate * dlambda

        primal = compute_energy(get_labelling(lambda_sum), unaries, edge_weights, edges)
        primal_history.append(primal)
        dual_history.append(dual)

        if iteration and (np.abs(dual - dual_history[-2]) < tol or p_norm < tol):
            if verbose:
                print 'Converged'
            break

        if iteration:
            if strategy == 'sqrt':
                learning_rate = 1. / np.sqrt(iteration)
            elif strategy == 'linear':
                learning_rate = 1. / iteration
            elif strategy == 'best-dual':
                best_dual = min(best_dual, dual)
                approx = best_dual - delta
                if dual <= dual_history[-2]:
                    delta *= r0
                else:
                    delta = max(r1 * delta, 1e-4)
                learning_rate = gamma * (dual - approx) / p_norm
            elif strategy == 'best-primal':
                best_primal = max(best_primal, primal)
                learning_rate = gamma * (dual - best_primal) / p_norm


        if verbose:
            print 'iteration {}: dual energy = {}'.format(iteration, dual)

    info = {}
    info['dual_energy'] = dual_history
    info['primal_energy'] = primal_history

    return lambda_sum, info
Example #7
def trw(node_weights,
        edges,
        edge_weights,
        y,
        max_iter=100,
        verbose=0,
        tol=1e-3,
        update_mu=50,
        get_energy=None):
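    # Same two-level scheme as Example #4: inner passes over the chain duals
    # with mu held fixed, then one subgradient step in mu against the
    # weak-label oracle optimize_kappa.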

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][
        0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))

    learning_rate = 0.1
    energy_history = []
    primal_history = []

    for iteration in xrange(max_iter):
        dmu = np.zeros((n_nodes, n_states))
        unaries = (node_weights - mu) * multiplier

        inner_energy = []
        for inner in xrange(update_mu):
            E = 0
            for i, chain in enumerate(chains):
                y_hat[i], energy = optimize_chain(
                    chain, lambdas[i] + unaries[chain, :], edge_weights,
                    edge_index)

                E += energy

            inner_energy.append(E)

            lambda_sum = np.zeros((n_nodes, n_states), dtype=np.float64)
            for p in xrange(n_nodes):
                for i in contains_node[p]:
                    pos = np.where(chains[i] == p)[0][0]
                    lambda_sum[p, y_hat[i][pos]] += multiplier[p]

            for i in xrange(len(chains)):
                N = lambdas[i].shape[0]

                lambdas[i][np.ogrid[:N], y_hat[i]] -= learning_rate
                lambdas[i] += learning_rate * lambda_sum[chains[i], :]

            if inner > 0 and np.abs(inner_energy[-2] - E) < 1e-2:
                break

        E = inner_energy[-1]

        y_hat_kappa, energy = optimize_kappa(y, mu, 1, n_nodes, n_states)
        E += energy

        for i in xrange(len(chains)):
            dmu[chains[i], y_hat[i]] -= multiplier[chains[i]].flatten()
        dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

        mu -= learning_rate * dmu

        energy_history.append(E)

        if get_energy is not None:
            primal = get_energy(get_labelling(lambda_sum))
            primal_history.append(primal)

        if iteration:
            learning_rate = 1. / np.sqrt(iteration)

        if verbose:
            print 'Iteration {}: inner={} energy={}'.format(
                iteration, inner, E)

        if iteration > 0 and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print 'Converged'
            break

    info = {
        'primal': primal_history,
        'dual': energy_history,
        'iteration': iteration
    }

    return lambda_sum, y_hat_kappa, info
Example #8
    def fit(self, X, Y, train_scorer, test_scorer, decompose='grid', w0=None):
        print('over unconstr begin')
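        # Delegates the whole training problem to scipy's fmin_l_bfgs_b: the
        # externally defined callback `f2` returns the regularized objective
        # and its gradient in w, and records progress into `history`.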

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index, sign = decompose_grid_graph(
                X, get_sign=True)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        for k in xrange(len(X)):
            n_nodes = X[k][0].shape[0]
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _y_hat.append(np.zeros(len(chain)))
            lambdas.append(np.zeros((n_nodes, self.n_states)))
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []

        history = {
            'train_scores': [],
            'test_scores': [],
            'objective': [],
            'iteration': 0,
            'w': []
        }

        self.train_scorer = train_scorer
        self.test_scorer = test_scorer

        #x0 = np.zeros(self.size_w + 4000 * len(X))
        x0 = np.zeros(self.size_w)
        if w0 is not None:
            x0 = w0


#        l = 0.1
#        for iteration in xrange(100):
#            fval, grad = f2(w, self, X, Y, history)
#            w -= l * grad
#            if iteration:
#                l = 0.01 / iteration

        x, f_val, d = fmin_l_bfgs_b(f2,
                                    x0,
                                    args=(self, X, Y, history),
                                    maxiter=self.max_iter,
                                    disp=0,
                                    pgtol=1e-8)

        w = x  # the learned weights are the L-BFGS minimizer
        return w, history
Example #9
    def fit(self, X, Y, train_scorer, test_scorer, decompose='general'):
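        # Fully supervised subgradient training: loss-augmented chain
        # subproblems yield both the gradient step for w and the per-chain
        # dual (lambda) updates.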
        self.logger.info('Initialization')

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index = decompose_grid_graph(X)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        for k in xrange(len(X)):
            n_nodes = X[k][0].shape[0]
            _lambdas = []
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _lambdas.append(np.zeros((len(chain), self.n_states)))
                _y_hat.append(np.zeros(len(chain)))
            lambdas.append(_lambdas)
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []
        self.train_score = []
        self.test_score = []
        self.w_history = []

        learning_rate = 0.1

        for iteration in xrange(self.max_iter):
            self.logger.info('Iteration %d', iteration)
            self.logger.info('Optimize slave MRF and update w')

            objective = 0
            dw = np.zeros(w.shape)

            for k in xrange(len(X)):
                self.logger.info('object %d', k)
                x, y = X[k], Y[k]
                n_nodes = x[0].shape[0]

                unaries = self._loss_augment_unaries(
                    self._get_unary_potentials(x, w), y.full, y.weights)
                unaries *= multiplier[k]

                pairwise = self._get_pairwise_potentials(x, w)

                objective += np.dot(w, self._joint_features_full(x, y.full))
                dw -= self._joint_features_full(x, y.full)

                for i in xrange(len(chains[k])):
                    y_hat[k][i], energy = optimize_chain(
                        chains[k][i], lambdas[k][i] + unaries[chains[k][i], :],
                        pairwise, edge_index[k])

                    dw += self._joint_features(chains[k][i], x, y_hat[k][i],
                                               edge_index[k], multiplier[k])
                    objective -= energy

            dw -= w / self.C

            w += learning_rate * dw
            objective = self.C * objective + np.sum(w**2) / 2

            if iteration and (iteration % self.check_every == 0):
                self.logger.info('Compute train and test scores')
                self.train_score.append(train_scorer(w))
                self.logger.info('Train SCORE: %f', self.train_score[-1])
                self.test_score.append(test_scorer(w))
                self.logger.info('Test SCORE: %f', self.test_score[-1])

            self.logger.info('Update lambda')

            for k in xrange(len(X)):
                n_nodes = X[k][0].shape[0]
                lambda_sum = np.zeros((n_nodes, self.n_states),
                                      dtype=np.float64)

                for p in xrange(n_nodes):
                    for i in contains_node[k][p]:
                        pos = np.where(chains[k][i] == p)[0][0]
                        lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]

                for i in xrange(len(chains[k])):
                    N = lambdas[k][i].shape[0]

                    lambdas[k][i][np.ogrid[:N], y_hat[k][i]] += learning_rate
                    lambdas[k][i] -= learning_rate * lambda_sum[
                        chains[k][i], :]

            self.logger.info('diff: %f', np.sum((w - self.w)**2))
            if iteration:
                learning_rate = 1.0 / iteration

            self.timestamps.append(time.time() - self.start_time)
            self.objective_curve.append(objective)

            self.logger.info('Objective: %f', objective)

            self.w = w.copy()
            self.w_history.append(self.w)

        self.w = w

        self.timestamps = np.array(self.timestamps)
        self.objective_curve = np.array(self.objective_curve)
        self.train_score = np.array(self.train_score)
        self.test_score = np.array(self.test_score)
        self.w_history = np.vstack(self.w_history)
Example #10
    def fit(self, X, Y, train_scorer, test_scorer, decompose='grid'):
        print('over unconstr begin')
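        # Signed-dual variant: decompose_grid_graph(..., get_sign=True)
        # assigns each chain a sign, a single lambda table per object is
        # shared by its chains, and the lambda update drives the chains
        # covering each node toward agreement.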

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index, sign = decompose_grid_graph(
                X, get_sign=True)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        for k in xrange(len(X)):
            n_nodes = X[k][0].shape[0]
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _y_hat.append(np.zeros(len(chain)))
            lambdas.append(np.zeros((n_nodes, self.n_states)))
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []
        self.train_score = []
        self.test_score = []
        self.w_history = []

        learning_rate = 0.1

        for iteration in xrange(self.max_iter):
            print('Iteration %d' % iteration)

            objective = 0
            dw = np.zeros(w.shape)

            for k in xrange(len(X)):
                x, y = X[k], Y[k]
                n_nodes = x[0].shape[0]

                unaries = self._loss_augment_unaries(
                    self._get_unary_potentials(x, w), y.full, y.weights)
                unaries *= multiplier[k]

                pairwise = self._get_pairwise_potentials(x, w)

                objective -= np.dot(w, self._joint_features_full(x, y.full))
                dw -= self._joint_features_full(x, y.full)

                for i in xrange(len(chains[k])):
                    y_hat[k][i], energy = optimize_chain(
                        chains[k][i],
                        sign[k][i] * lambdas[k][chains[k][i], :] +
                        unaries[chains[k][i], :], pairwise, edge_index[k])

                    dw += self._joint_features(chains[k][i], x, y_hat[k][i],
                                               edge_index[k], multiplier[k])
                    objective += energy

            dw += w / self.C

            w -= learning_rate * dw
            objective = self.C * objective + np.sum(w**2) / 2

            if iteration and (iteration % self.check_every == 0):
                print('Compute train and test scores')
                self.train_score.append(train_scorer(w))
                print('Train SCORE: %f' % self.train_score[-1])
                self.test_score.append(test_scorer(w))
                print('Test SCORE: %f' % self.test_score[-1])

            for k in xrange(len(X)):
                n_nodes = X[k][0].shape[0]

                for p in xrange(n_nodes):
                    dlambda = np.zeros(self.n_states)
                    for i in contains_node[k][p]:
                        pos = np.where(chains[k][i] == p)[0][0]
                        dlambda[y_hat[k][i][pos]] += sign[k][i]
                    lambdas[k][p] -= learning_rate * dlambda

            if iteration:
                learning_rate = 1.0 / iteration

            self.timestamps.append(time.time() - self.start_time)
            self.objective_curve.append(objective)

            self.w = w.copy()
            self.w_history.append(self.w)

        self.w = w

        self.timestamps = np.array(self.timestamps)
        self.objective_curve = np.array(self.objective_curve)
        self.train_score = np.array(self.train_score)
        self.test_score = np.array(self.test_score)
        self.w_history = np.vstack(self.w_history)
Example #11
    def fit(self, X, Y, train_scorer, test_scorer, decompose='grid'):
        print('over unconstr begin')
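        # Signed-dual variant (same scheme as Example #10): one shared
        # lambda table per object, with each chain oracle seeing
        # sign[k][i] * lambdas.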

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index, sign = decompose_grid_graph(X, get_sign=True)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        for k in xrange(len(X)):
            n_nodes = X[k][0].shape[0]
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _y_hat.append(np.zeros(len(chain)))
            lambdas.append(np.zeros((n_nodes, self.n_states)))
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []
        self.train_score = []
        self.test_score = []
        self.w_history = []

        learning_rate = 0.1

        for iteration in xrange(self.max_iter):
            print('Iteration %d' % iteration)

            objective = 0
            dw = np.zeros(w.shape)

            for k in xrange(len(X)):
                x, y = X[k], Y[k]
                n_nodes = x[0].shape[0]

                unaries = self._loss_augment_unaries(self._get_unary_potentials(x, w), y.full, y.weights)
                unaries *= multiplier[k]

                pairwise = self._get_pairwise_potentials(x, w)

                objective -= np.dot(w, self._joint_features_full(x, y.full))
                dw -= self._joint_features_full(x, y.full)

                for i in xrange(len(chains[k])):
                    y_hat[k][i], energy = optimize_chain(chains[k][i],
                                                         sign[k][i] * lambdas[k][chains[k][i],:] + unaries[chains[k][i],:],
                                                         pairwise,
                                                         edge_index[k])

                    dw += self._joint_features(chains[k][i], x, y_hat[k][i], edge_index[k], multiplier[k])
                    objective += energy

            dw += w / self.C

            w -= learning_rate * dw
            objective = self.C * objective + np.sum(w ** 2) / 2

            if iteration and (iteration % self.check_every == 0):
                print('Compute train and test scores')
                self.train_score.append(train_scorer(w))
                print('Train SCORE: %f' % self.train_score[-1])
                self.test_score.append(test_scorer(w))
                print('Test SCORE: %f' % self.test_score[-1])

            for k in xrange(len(X)):
                n_nodes = X[k][0].shape[0]

                for p in xrange(n_nodes):
                    dlambda = np.zeros(self.n_states)
                    for i in contains_node[k][p]:
                        pos = np.where(chains[k][i] == p)[0][0]
                        dlambda[y_hat[k][i][pos]] += sign[k][i]
                    lambdas[k][p] -= learning_rate * dlambda

            if iteration:
                learning_rate = 1.0 / iteration

            self.timestamps.append(time.time() - self.start_time)
            self.objective_curve.append(objective)

            self.w = w.copy()
            self.w_history.append(self.w)
        
        self.w = w

        self.timestamps = np.array(self.timestamps)
        self.objective_curve = np.array(self.objective_curve)
        self.train_score = np.array(self.train_score)
        self.test_score = np.array(self.test_score)
        self.w_history = np.vstack(self.w_history)
Example #12
def trw(node_weights,
        edges,
        edge_weights,
        y,
        max_iter=100,
        verbose=0,
        tol=1e-3,
        relaxed=False):
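    # Graph-cut flavour of the weak-label dual (same scheme as Example #1):
    # inference_gco solves one subproblem, optimize_kappa the other, and mu
    # takes subgradient steps between them.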

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][
        0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))

    learning_rate = 0.1
    energy_history = []

    for iteration in xrange(max_iter):
        E = 0
        dmu = np.zeros((n_nodes, n_states))
        unaries = node_weights - mu

        y_hat_gco, energy = inference_gco(unaries,
                                          edge_weights,
                                          edges,
                                          n_iter=5,
                                          return_energy=True)
        E -= energy

        y_hat_kappa, energy = optimize_kappa(y, mu, 1, n_nodes, n_states)
        E += energy

        dmu[np.ogrid[:dmu.shape[0]], y_hat_gco] -= 1
        dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

        mu -= learning_rate * dmu

        energy_history.append(E)

        if iteration:
            learning_rate = 1. / np.sqrt(iteration)

        if verbose:
            print 'Iteration {}: energy {}'.format(iteration, E)

        if iteration and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print 'Converged'
            break

    return y_hat_gco, y_hat_kappa, energy_history, iteration
Example #13
def trw(node_weights, edges, edge_weights, y, max_iter=100, verbose=0, tol=1e-3):
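    # Chain-based weak-label dual (same scheme as Example #3); the dual
    # variables are kept zero-sum at every node, as checked by the
    # assertion below.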

    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][0]

    n_nodes, n_states = node_weights.shape

    y_hat = []
    lambdas = []
    multiplier = []

    for p in xrange(n_nodes):
        multiplier.append(1.0 / (len(contains_node[p]) + 1))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))

    multiplier = np.array(multiplier)
    multiplier.shape = (n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))

    learning_rate = 0.1
    energy_history = []

    for iteration in xrange(max_iter):
        E = 0
        unaries = node_weights.copy()
        for label in xrange(n_states):
            if label not in y.weak:
                unaries[:, label] += y.weights
        unaries *= multiplier

        for i, chain in enumerate(chains):
            y_hat[i], energy = optimize_chain(chain, lambdas[i] + unaries[chain, :], edge_weights, edge_index)

            E += energy

        y_hat_kappa, energy = optimize_kappa(y, mu + unaries, 1, n_nodes, n_states, augment=False)
        E += energy

        lambda_sum = np.zeros((n_nodes, n_states), dtype=np.float64)
        for p in xrange(n_nodes):
            assert len(contains_node[p]) == 2
            for i in contains_node[p]:
                pos = np.where(chains[i] == p)[0][0]
                lambda_sum[p, y_hat[i][pos]] += multiplier[p]

        lambda_sum[np.ogrid[:n_nodes], y_hat_kappa] += multiplier.flatten()

        for i in xrange(len(chains)):
            N = lambdas[i].shape[0]

            lambdas[i][np.ogrid[:N], y_hat[i]] -= learning_rate
            lambdas[i] += learning_rate * lambda_sum[chains[i], :]

        mu[np.ogrid[:n_nodes], y_hat_kappa] -= learning_rate
        mu += learning_rate * lambda_sum

        test_l = np.zeros((n_nodes, n_states))
        for p in xrange(n_nodes):
            for i in contains_node[p]:
                pos = np.where(chains[i] == p)[0][0]
                test_l[p, :] += lambdas[i][pos, :]
        test_l += mu

        # the chain duals and mu must stay zero-sum at every node
        assert np.abs(np.sum(test_l)) < 1e-10

        energy_history.append(E)

        if iteration:
            learning_rate = 1.0 / np.sqrt(iteration)

        if verbose:
            print "Iteration {}: energy {}".format(iteration, E)

        if iteration > 300 and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print "Converged"
            break

    return lambda_sum, y_hat_kappa, energy_history, iteration
Example #14
    def fit(self, X, Y, train_scorer, test_scorer, decompose='grid', w0=None):
        print('over unconstr begin')
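        # L-BFGS training wrapper (same scheme as Example #8) around the
        # external objective/gradient callback `f2`.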

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index, sign = decompose_grid_graph(X, get_sign=True)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        for k in xrange(len(X)):
            n_nodes = X[k][0].shape[0]
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _y_hat.append(np.zeros(len(chain)))
            lambdas.append(np.zeros((n_nodes, self.n_states)))
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []


        history = {'train_scores': [],
                   'test_scores': [],
                   'objective': [],
                   'iteration': 0,
                   'w': []
                   }

        self.train_scorer = train_scorer
        self.test_scorer = test_scorer

        #x0 = np.zeros(self.size_w + 4000 * len(X))
        x0 = np.zeros(self.size_w)
        if w0 is not None:
            x0 = w0

#        l = 0.1
#        for iteration in xrange(100):
#            fval, grad = f2(w, self, X, Y, history)
#            w -= l * grad
#            if iteration:
#                l = 0.01 / iteration

        x, f_val, d = fmin_l_bfgs_b(f2, x0,
                                    args=(self, X, Y, history),
                                    maxiter=self.max_iter,
                                    disp=0,
                                    pgtol=1e-8)

        w = x  # the learned weights are the L-BFGS minimizer
        return w, history
Example #15
    def fit(self, X, Y, train_scorer, test_scorer, decompose='general'):
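        # Fully supervised subgradient training (same scheme as Example #9):
        # loss-augmented chain subproblems drive both w and the lambdas.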
        self.logger.info('Initialization')

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index = decompose_grid_graph(X)
        else:
            raise ValueError

        y_hat = []
        lambdas = []
        multiplier = []
        for k in xrange(len(X)):
            n_nodes = X[k][0].shape[0]
            _lambdas = []
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _lambdas.append(np.zeros((len(chain), self.n_states)))
                _y_hat.append(np.zeros(len(chain)))
            lambdas.append(_lambdas)
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []
        self.train_score = []
        self.test_score = []
        self.w_history = []

        learning_rate = 0.1

        for iteration in xrange(self.max_iter):
            self.logger.info('Iteration %d', iteration)
            self.logger.info('Optimize slave MRF and update w')

            objective = 0
            dw = np.zeros(w.shape)

            for k in xrange(len(X)):
                self.logger.info('object %d', k)
                x, y = X[k], Y[k]
                n_nodes = x[0].shape[0]

                unaries = self._loss_augment_unaries(self._get_unary_potentials(x, w), y.full, y.weights)
                unaries *= multiplier[k]

                pairwise = self._get_pairwise_potentials(x, w)

                objective += np.dot(w, self._joint_features_full(x, y.full))
                dw -= self._joint_features_full(x, y.full)

                for i in xrange(len(chains[k])):
                    y_hat[k][i], energy = optimize_chain(chains[k][i],
                                                         lambdas[k][i] + unaries[chains[k][i],:],
                                                         pairwise,
                                                         edge_index[k])

                    dw += self._joint_features(chains[k][i], x, y_hat[k][i], edge_index[k], multiplier[k])
                    objective -= energy

            dw -= w / self.C

            w += learning_rate * dw
            objective = self.C * objective + np.sum(w ** 2) / 2

            if iteration and (iteration % self.check_every == 0):
                self.logger.info('Compute train and test scores')
                self.train_score.append(train_scorer(w))
                self.logger.info('Train SCORE: %f', self.train_score[-1])
                self.test_score.append(test_scorer(w))
                self.logger.info('Test SCORE: %f', self.test_score[-1])

            self.logger.info('Update lambda')

            for k in xrange(len(X)):
                n_nodes = X[k][0].shape[0]
                lambda_sum = np.zeros((n_nodes, self.n_states), dtype=np.float64)

                for p in xrange(n_nodes):
                    for i in contains_node[k][p]:
                        pos = np.where(chains[k][i] == p)[0][0]
                        lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]

                for i in xrange(len(chains[k])):
                    N = lambdas[k][i].shape[0]

                    lambdas[k][i][np.ogrid[:N], y_hat[k][i]] += learning_rate
                    lambdas[k][i] -= learning_rate * lambda_sum[chains[k][i],:]

            self.logger.info('diff: %f', np.sum((w-self.w)**2))
            if iteration:
                learning_rate = 1.0 / iteration

            self.timestamps.append(time.time() - self.start_time)
            self.objective_curve.append(objective)

            self.logger.info('Objective: %f', objective)

            self.w = w.copy()
            self.w_history.append(self.w)
        
        self.w = w

        self.timestamps = np.array(self.timestamps)
        self.objective_curve = np.array(self.objective_curve)
        self.train_score = np.array(self.train_score)
        self.test_score = np.array(self.test_score)
        self.w_history = np.vstack(self.w_history)
Example #16
    def fit(self, X, Y, train_scorer, test_scorer, decompose='general',
            use_latent_first_iter=500, undergenerating_weak=True, smd=False):
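        # Mixed-supervision trainer (same scheme as Example #5): full labels
        # use loss-augmented chains; weak labels use the gco, chain-dual, or
        # smd branches.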
        self.logger.info('Initialization')

        if decompose == 'general':
            contains_node, chains, edge_index = decompose_graph(X)
        elif decompose == 'grid':
            contains_node, chains, edge_index = decompose_grid_graph(X)
        else:
            raise ValueError
    
        y_hat = []
        lambdas = []
        multiplier = []
        xx = []
        mu = {}
        for k in xrange(len(X)):
            x, y = X[k], Y[k]
            n_nodes = x[0].shape[0]
            xx.append(np.zeros(n_nodes))
            _lambdas = []
            _y_hat = []
            _multiplier = []
            for p in xrange(n_nodes):
                _multiplier.append(1.0 / len(contains_node[k][p]))
            for chain in chains[k]:
                _lambdas.append(np.zeros((len(chain), self.n_states)))
                _y_hat.append(np.zeros(len(chain), dtype=np.int32))
            lambdas.append(_lambdas)
            y_hat.append(_y_hat)
            _multiplier = np.array(_multiplier)
            _multiplier.shape = (n_nodes, 1)
            multiplier.append(_multiplier)
            if not y.full_labeled:
                mu[k] = np.zeros((n_nodes, self.n_states))

        w = np.zeros(self.size_w)
        self.w = w.copy()

        self.start_time = time.time()
        self.timestamps = [0]
        self.objective_curve = []
        self.train_score = []
        self.test_score = []
        self.w_history = []

        learning_rate1 = 0.1
        learning_rate2 = 0.1

        for iteration in xrange(self.max_iter):
            self.logger.info('Iteration %d', iteration)
            self.logger.info('Optimize slave MRF and update w')

            objective = 0
            dw = np.zeros(w.shape)

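            # Solve the slave problems example by example, accumulating the subgradient in dw.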
            for k in xrange(len(X)):
                x, y = X[k], Y[k]
                n_nodes = x[0].shape[0]

                # self.logger.info('object %d', k)

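                # Fully labelled example: loss-augmented MAP, solved independently on each chain.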
                if y.full_labeled:
                    unaries = self._loss_augment_unaries(self._get_unary_potentials(x, w),
                                                         y.full, y.weights)
                    unaries *= multiplier[k]
                    pairwise = self._get_pairwise_potentials(x, w)

                    jf = self._joint_features_full(x, y.full)
                    objective -= np.dot(w, jf)
                    dw -= jf

                    for i in xrange(len(chains[k])):
                        y_hat[k][i], energy = optimize_chain(chains[k][i],
                                                             lambdas[k][i] + unaries[chains[k][i],:],
                                                             pairwise,
                                                             edge_index[k])

                        dw += self._joint_features(chains[k][i], x, y_hat[k][i], edge_index[k], multiplier[k])

                        objective += energy
                elif iteration > use_latent_first_iter:
                    if undergenerating_weak:
                        # Alternative (disabled): use gco for the full K oracle.
                        # y_hat_, energy = self.loss_augmented_inference(x, y, w)
                        # jf_gt = self._joint_features_full(x, y.full)
                        # objective -= np.dot(w, jf_gt)
                        # objective += energy
                        # dw -= jf_gt
                        # dw += self._joint_features_full(x, y_hat_)

                        # Use gco for the first summand in the dual decomposition.
                        for mm in xrange(10):
                            dmu = np.zeros((n_nodes, self.n_states))

                            unaries = self._get_unary_potentials(x, w) - mu[k]
                            pairwise = self._get_pairwise_potentials(x, w)

                            y_hat_gco, energy = inference_gco(unaries, pairwise, self._get_edges(x),
                                                              n_iter=5, return_energy=True)
                            objective -= energy
                            dmu[np.ogrid[:dmu.shape[0]], y_hat_gco] -= 1
                            dw += self._joint_features_full(x, y_hat_gco)

                            jf = self._joint_features_full(x, y.full)
                            objective -= np.dot(w, jf)
                            dw -= jf

                            y_hat_kappa, energy = optimize_kappa(y, mu[k], self.alpha, n_nodes, self.n_states)
                            objective += energy
                            dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

                            mu[k] -= learning_rate2 * dmu
                    elif not smd:
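                        # Weakly labelled example, chain decomposition: an inner loop re-optimizes
                        # the chain duals before mu is updated.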
                        dmu = np.zeros((n_nodes, self.n_states))

                        unaries = (self._get_unary_potentials(x, w) - mu[k]) * multiplier[k]
                        pairwise = self._get_pairwise_potentials(x, w)

                        jf = self._joint_features_full(x, y.full)
                        objective -= np.dot(w, jf)
                        dw -= jf

                        # Begin inner loop (can be removed to restore the previous behaviour).
                        E = 0
                        Eprev = -100
                        for j in xrange(self.update_mu):
                            E = 0
                            for i in xrange(len(chains[k])):
                                y_hat[k][i], energy = optimize_chain(chains[k][i],
                                                                     lambdas[k][i] + unaries[chains[k][i],:],
                                                                     pairwise,
                                                                     edge_index[k])
                                E += energy


                            lambda_sum = np.zeros((n_nodes, self.n_states), dtype=np.float64)

                            for p in xrange(n_nodes):
                                for i in contains_node[k][p]:
                                    pos = np.where(chains[k][i] == p)[0][0]
                                    lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]

                            for i in xrange(len(chains[k])):
                                N = lambdas[k][i].shape[0]

                                lambdas[k][i][np.ogrid[:N], y_hat[k][i]] -= learning_rate2
                                lambdas[k][i] += learning_rate2 * lambda_sum[chains[k][i],:]

                            if np.abs(E - Eprev) < 0.1:
                                break
                            Eprev = E
                        # End inner loop.

                        # Final pass over the chains with the updated duals.
                        for i in xrange(len(chains[k])):
                            y_hat[k][i], energy = optimize_chain(chains[k][i],
                                                                 lambdas[k][i] + unaries[chains[k][i],:],
                                                                 pairwise,
                                                                 edge_index[k])

                            dw += self._joint_features(chains[k][i], x, y_hat[k][i], edge_index[k], multiplier[k])

                            objective += energy

                            dmu[chains[k][i], y_hat[k][i]] -= multiplier[k][chains[k][i]].flatten()

                        y_hat_kappa, energy = optimize_kappa(y, mu[k], self.alpha, n_nodes, self.n_states)

                        objective += energy
                        dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1

                        mu[k] -= learning_rate2 * dmu
                    elif smd:
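                        # smd branch: split the problem over states into binary subproblems
                        # coupled by the node duals xx[k], which are optimized with L-BFGS.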
                        if iteration > 1500:
                            mMu = 10
                        else:
                            mMu = 1
                        for mm in xrange(mMu):
                            dmu = np.zeros((n_nodes, self.n_states))

                            jf = self._joint_features_full(x, y.full)
                            objective -= np.dot(w, jf)
                            dw -= jf

                            unaries = -self._get_unary_potentials(x, w) + mu[k]
                            edge_weights = -self._get_pairwise_potentials(x, w)
                            edges = self._get_edges(x)

                            n_edges = edges.shape[0]
                            y_hat2 = []
                            pairwise = []
                            for j in xrange(self.n_states):
                                y_hat2.append(np.zeros(n_nodes))  # placeholder; overwritten per state below
                                _pairwise = np.zeros((n_edges, 2, 2))
                                for i in xrange(n_edges):
                                    _pairwise[i,1,0] = _pairwise[i,0,1] = -0.5 * edge_weights[i,j,j]
                                pairwise.append(_pairwise)
                    
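                            # Fold the equal-label (diagonal) pairwise terms into the unaries of both endpoints.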
                            for i in xrange(n_edges):
                                e1, e2 = edges[i]
                                unaries[e1,:] += 0.5 * np.diag(edge_weights[i,:,:])
                                unaries[e2,:] += 0.5 * np.diag(edge_weights[i,:,:])
                    
                            xx[k], f_val, d = fmin_l_bfgs_b(f, xx[k],
                                                            args=(unaries, pairwise, edges),
                                                            maxiter=50,
                                                            maxfun=50,
                                                            pgtol=1e-2)
                                
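                            # Dual objective: the node duals minus half of each per-state binary MAP energy.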
                            E = np.sum(xx[k])
                            for j in xrange(self.n_states):
                                new_unaries = np.zeros((n_nodes, 2))
                                new_unaries[:,1] = unaries[:,j] + xx[k]
                                y_hat2[j], energy = binary_general_graph(edges, new_unaries, pairwise[j])
                                E -= 0.5*energy
                                dmu[:,j] -= y_hat2[j]

                                dw += self._joint_features_full(x, y_hat2[j] * j)
                    
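                            # Kappa oracle on the weak labels; note alpha is hard-coded to 1 here,
                            # unlike the self.alpha used in the other branches.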
                            y_hat_kappa, energy = optimize_kappa(y, mu[k], 1, n_nodes, self.n_states)
                            E += energy
                            dmu[np.ogrid[:dmu.shape[0]], y_hat_kappa] += 1
                            objective += E
                    
                            mu[k] -= learning_rate2 * dmu

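            # Regularizer term: with objective = C * (data terms) + ||w||^2 / 2,
            # the subgradient scaled by 1/C contributes w / C.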
            dw += w / self.C

            if iteration < 100 or iteration % self.update_w_every == 0:
                w -= learning_rate1 * dw
            objective = self.C * objective + np.sum(w ** 2) / 2

            self.logger.info('Update lambda')

            for k in xrange(len(X)):
                if (undergenerating_weak or smd) and not Y[k].full_labeled:
                    continue

                n_nodes = X[k][0].shape[0]
                lambda_sum = np.zeros((n_nodes, self.n_states), dtype=np.float64)

                for p in xrange(n_nodes):
                    for i in contains_node[k][p]:
                        pos = np.where(chains[k][i] == p)[0][0]
                        lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]

                for i in xrange(len(chains[k])):
                    N = lambdas[k][i].shape[0]

                    lambdas[k][i][np.ogrid[:N], y_hat[k][i]] -= learning_rate2
                    lambdas[k][i] += learning_rate2 * lambda_sum[chains[k][i],:]

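            # Periodically re-impute the latent variables with the current w
            # (plus a few hand-picked early iterations).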
            if iteration % self.complete_every == 0 or iteration in [51, 80, 101, 130]:
                self.logger.info('Complete latent variables')
                Y_new = Parallel(n_jobs=self.n_jobs, verbose=0, max_nbytes=1e8)(
                    delayed(latent)(self.model, x, y, w) for x, y in zip(X, Y))
                changes = np.sum([np.any(y_new.full != y.full) for y_new, y in zip(Y_new, Y)])
                self.logger.info('changes in latent variables: %d', changes)
                Y = Y_new

            if iteration and (iteration % self.check_every == 0):
                self.logger.info('Compute train and test scores')
                self.train_score.append(train_scorer(w))
                self.logger.info('Train SCORE: %f', self.train_score[-1])
                self.test_score.append(test_scorer(w))
                self.logger.info('Test SCORE: %f', self.test_score[-1])

            self.logger.info('diff: %f', np.sum((w-self.w)**2))
            if iteration:
                learning_rate1 = 1.0 / iteration
                learning_rate2 = 1.0 / iteration

            self.timestamps.append(time.time() - self.start_time)
            self.objective_curve.append(objective)

            self.logger.info('Objective: %f', objective)

            self.w = w.copy()
            self.w_history.append(self.w)
        
        self.w = w

        self.timestamps = np.array(self.timestamps)
        self.objective_curve = np.array(self.objective_curve)
        self.train_score = np.array(self.train_score)
        self.test_score = np.array(self.test_score)
        self.w_history = np.vstack(self.w_history)
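
# A minimal usage sketch. The class name `DualDecompositionLatentSSVM`, the
# `model` object and the `score` helper are placeholder assumptions; only the
# fit() signature comes from the example above.
#
#     learner = DualDecompositionLatentSSVM(model, n_states=10, max_iter=1000, C=0.1)
#     learner.fit(X_train, Y_train,
#                 train_scorer=lambda w: score(w, X_train, Y_train),
#                 test_scorer=lambda w: score(w, X_test, Y_test),
#                 decompose='grid')
#     w_learned = learner.w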