def train_ksne_lbfgs(X,Y,P,K,mf=10):
    N = X.shape[0]   # num points
    D = X.shape[1]   # input dimension

    Z = np.random.randn(N,P)  # points in embedded space

    T = sc.make_balanced_binary_tree(N)
    root_idx = N + T.shape[0]-1

    global_count_potential = np.zeros(N+1)
    global_count_potential[K] = 1
    only_global_constraint_dict = {}
    only_global_constraint_dict[root_idx] = global_count_potential

    res = lbfgs(sne_obj, A.flatten(), fprime=sne_grad, args=(X,Y,K,P,T,only_global_constraint_dict,roots), maxfun=mf, disp=1)

    return res[0].reshape(A.shape)
def train_sgd(X,Y,dataset_name,P,K,Z=None,num_iters=1,perplexity=5,batch_size=10,eta=0.1,mo=0,L2=0,class_to_keep=[],seed=1,distance='sqeuclidean',text_labels=None):
    """
X, Y: points and their respective labels 
dataset_name: the name given to this dataset
P: the dimensionality of the embedding space
K: the number of neighbors 
Z: coordinates of the points in embedded space
num_iters: number of passes over the set of points  
perplexity: perplexity of the distribution over datapoints. Check: 
            'Visualizing Data using t-SNE by van der Maaten and Hinton in JMLR (09) 2008', 
            for details. 
batch_size: size of mini batches
eta: learning rate
mo: momentum
L2: size of L2 regularizer
class_to_keep: subset of labels that have been kept (for naming results and figure files)
seed: the value of the seed (for naming results and figure files)
distance: distance metric {sqeuclidean, emd}
test_labels: alternative labels used in plots
    """

    N = X.shape[0]   # num points
    D = X.shape[1]   # input dimension

    ##################################################################################
    if Z is None:  Z = np.random.randn(N,P)*0.1  # points in embedded space

    if P == 2: # plot if embedding is in 2D
        plot_embedding(Z,Y,K,eta,mo,0,target_entropy=perplexity,iters=0,tot_iters=num_iters,class_to_keep=class_to_keep,seed=seed,dataset_name=dataset_name,text_labels=text_labels)

    if distance == 'sqeuclidean': 
        dijs = -squareform(pdist(X, 'sqeuclidean'))
    elif distance == 'emd': # earth mover's distance
        from emd.emddist import emddist
        dijs = -emddist(X)
    dijs = dist_normalize(dijs,Y,perplexity=perplexity,dataset_name=dataset_name) 
    dijs -= HUGE_VAL * np.eye(N)
    pijs = np.zeros((N,N))

    T = sc.make_balanced_binary_tree(N)
    root_idx = N + T.shape[0]-1
    print "Root idx", root_idx

    global_count_potential = np.zeros(N+1)
    global_count_potential[K] = 1
    only_global_constraint_dict = {}
    only_global_constraint_dict[root_idx] = global_count_potential

    # Precompute E_i[y_j] (or pijs)
    print 'precomute E_i[y_j]'

    from chain.chain_alg_wrapper import ChainAlg
    chain = ChainAlg(N, Kmin=K, Kmax=K, minibatch_size = N)
    pot = dijs
    #exp_pot = np.exp(dijs)
    marginals, samples, logZs = chain.infer(pot)
    pijs = marginals


    debug = False
    if debug:
        pijs_debug = pijs.copy()
        for i in xrange(N):
            thetas = dijs[i,:]
            #print np.exp(thetas)
            node_margs, count_margs, log_Z = \
                sc.conv_tree(np.exp(thetas), only_global_constraint_dict, T, use_fft=False)
            pijs_debug[i,:] = node_margs

        diff = abs(pijs - pijs_debug).max() 
        assert diff < 1e-8
        print 'precomputation of marginals is correct!'


    ##################################################################################

    num_batches = np.ceil(np.double(N)/batch_size)
    randIndices = np.random.permutation(X.shape[0])

    print 'num_batches = %s'  % num_batches

    dZ = np.zeros(Z.shape)
    V = dZ*0
    for i in range(num_iters):
        f_tot = 0
        for batch in range(int(num_batches)):
            print "iteration " + str(i+1) + " batch " + str(batch+1) + " of " + str(int(num_batches))
            ind = randIndices[np.mod(range(batch*batch_size,(batch+1)*batch_size),X.shape[0])]
            f_tot += ksne_obj(Z.flatten(),X,K,P,pijs,only_global_constraint_dict,T)


            g = ksne_grad((Z+V*mo).flatten(),X,K,P,pijs,only_global_constraint_dict,T).reshape(Z.shape)

            #dZ = mo*dZ - eta*g
            V = V*mo + eta*(g-L2*(Z+V*mo))
            #Z -= dZ
            Z += V
        print 'objective: %s, |g|=%s' % (str(f_tot/num_batches), abs(g).mean())
        if P == 2: # plot if embedding is in 2D
            plot_embedding(Z,Y,K,eta,mo,f_tot/num_batches,target_entropy=perplexity,iters=i+1,tot_iters=num_iters,class_to_keep=class_to_keep,seed=seed,dataset_name=dataset_name,text_labels=text_labels)

    return Z