def Lloyd_iteration2( A,P, w ,Q):
    """
    One weighted Lloyd update: every row of P is assigned to its nearest centroid in Q (via
    squaredis), and the new sparse centroids are the weighted means of each cluster's rows of A.
    """
    dists,Tags,_=squaredis(P,Q)
    print('finish squaredis')
    Qjl=SM((Q.shape[0],A.shape[1]))
    wq=np.zeros((Q.shape[0],1))
    w=np.reshape(w,(len(w),1))
    for i in range (Qjl.shape[0]):
            #print(i)
            inds=np.where(Tags==i)[0]  

            wmin=0
            wi=w[inds,:]-wmin
            Qjl[i,:]=(A[inds,:].multiply(wi)).sum(0)
            wq[i,:]=np.sum(wi,0)
    wq[wq==0]=1
    wqi=1/wq
    Qjl=Qjl.multiply(wqi+wmin)
    return SM(Qjl)
def kmeans_plspls1(A,w,eps,V,clus_num,we,alfa_app,is_sparse,is_jl):
        """
        This funtion operates the kmeans++ initialization algorithm. each point chosed under the Sinus probability.
        Input:
            A: data matrix, n points, each on a sphere of dimension d.
            k: number of required points to find.
        Output:
            Cents: K initial centroids, each of a dimension d.
        """
        if is_sparse==1:
            A=SM(A)
        if is_jl==1:
            dex=int(clus_num*np.log(A.shape[0]))
    
            ran=np.random.randn(A.shape[1],dex)
            A=SM.dot(A,ran)
            is_sparse=0      #A=np.multiply(w1,A)
        num_of_samples = A.shape[0]
        if any(np.isnan(np.ravel(w)))+any(np.isinf(np.ravel(w))):
            Cents= A[np.random.choice(num_of_samples,size=1),:]   #choosing arbitrary point as the first               
        else: 
            w[w<0]=0               
            Cents= A[np.random.choice(num_of_samples,size=1,p=np.ravel(w)/np.sum(np.ravel(w))),:] #choosing arbitrary point as the first               
        if is_sparse==1:
            PA=make_P(A)
        else:
            PA=make_P_dense(A)
        fcost=alfa_app*1.1
        h1=1
        inds=[]
        while (Cents.shape[0]<clus_num+1):
            Cents2=Cents[h1-1:h1,:] 
            if is_sparse==1:
                Pmina,tags,_=squaredis(PA,Cents2)  
            else:
                Pmina,tags,_=squaredis_dense(PA,Cents2)  
            if h1==1:
                Pmin=Pmina
            else:
                Pmin=np.minimum(Pmin,Pmina)
                Pmin[np.asarray(inds)]=0
            Pmin[Pmin<0]=0
            Pmin00=np.multiply(w,Pmin)
            Pmin0=Pmin00/np.sum(Pmin00)
            if any(np.isnan(np.ravel(Pmin0)))+any(np.isinf(np.ravel(Pmin0))):
                ind=np.random.choice(Pmin.shape[0],1)
            else:
                Pmin0[Pmin0<0]=0
                ind=np.random.choice(Pmin.shape[0],1, p=Pmin0)
            if is_sparse==1:
                Cents=vstack((Cents,A[ind,:]))
            else:
                Cents=np.concatenate((Cents,A[ind,:]),0)
            inds.append(ind)
            h1=h1+1
        return Cents,inds
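# --- Usage sketch for kmeans_plspls1 (added for illustration; not part of the original code) ---
# A minimal, hedged example on toy data. It assumes numpy as np, that SM is
# scipy.sparse.csr_matrix (as the calls above suggest), and that the module's make_P /
# squaredis helpers are available. All names ending in _demo are invented for this sketch.
import numpy as np
from scipy.sparse import csr_matrix as SM  # assumed to match the SM alias used above

X_demo = np.random.randn(200, 20)          # 200 toy points in 20 dimensions
w_demo = np.ones(X_demo.shape[0])          # uniform point weights
# eps and V are not used by the initialization itself, so dummy values are passed.
Cents_demo, inds_demo = kmeans_plspls1(SM(X_demo), w_demo, 0, [], 5,
                                       np.ravel(w_demo), 0.01, 1, 0)
print(Cents_demo.shape)                    # (6, 20): the random seed plus 5 sampled centroids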
def FBL_positive(P,u,B,Q,epsit,is_sparse=1):
     if is_sparse==0:    
         BB=make_P_dense(B)
         P0=make_P_dense(P)

         d1,tags,_=squaredis_dense(P0,B)
         d2,_,_=squaredis_dense(BB[tags,:],Q)
     else:
         BB=SM(make_P(B))
         P0=SM(make_P(P))

         d1,tags,_=squaredis(P0,B)
         d2,_,_=squaredis(BB[np.ravel(tags),:],Q)
     d=d1/epsit-d2
     print('d zeroing fraction',len(np.where(d<0)[0])/len(d))
  #   print('d',len(d))
 #    print('u',len(u))
#     print('P',len(P))
     u[np.where(d<0)[0]]=0
     return u
def alaa_coreset(wiki0,j,eps,w,is_pca,spar): 
    """
    our algorithm, equivalent to Algorithm 1 in the paper.
    input:
        wiki0:data matrix
        j: dimension of the approximated subspace
        eps: determine coreset size
        w: initial weights
        is_pca: 1 coreset for pca, 0 coreset dor SVD
        spar: is data in sparse format
    output:
        weighted coreset
    """
    coreset_size=j/eps
    dex=int(j*np.log(wiki0.shape[0]))
    d=wiki0.shape[1]
    if is_pca==1:
        j=j+1
        wiki0=PCA_to_SVD(wiki0,eps,spar)
    if is_jl==1:   # is_jl is read from module scope; it is not a parameter of this function
        ran=np.random.randn(wiki0.shape[1],dex)
        if spar==1:
            wiki=SM.dot(wiki0,ran)	
        else:
            wiki=np.dot(wiki0,ran)	
    else:
        wiki=wiki0
    w=w/wiki.shape[0]
    sensetivities=[]
    jd=j
    w1=np.reshape(w,(len(w),1))
    wiki1=np.multiply(np.sqrt(w1),wiki)
    k=0
    for i,p in enumerate(wiki1) :
        k=k+1
        sensetivities.append(calc_sens(wiki1,p,jd,eps))
    
    p0=np.asarray(sensetivities)
    if is_pca==1:
        p0=p0+81*eps
    indec=np.random.choice(np.arange(wiki.shape[0]),int(coreset_size),p=p0/np.sum(p0)) #sampling according to the sensitivity
    p=p0/np.sum(p0) #normalizing sensitivities
    w=np.ones(wiki.shape[0])
    u=np.divide(np.sqrt(w),p)/coreset_size #calculating new weights
    u1=u[indec]#picking weights of sampled
    u1=np.reshape(u1,(len(u1),1))
    squ=np.sqrt(u1)   
    if spar==1:        
        C=SM(wiki0)[indec,:d].multiply(squ) #weighted coreset
    else:
        C=np.multiply(squ,wiki0[indec,:d])
    return C
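# --- Usage sketch for alaa_coreset (added for illustration; not part of the original code) ---
# A hedged example on dense toy data. It assumes numpy as np and that the helper calc_sens
# (referenced above but defined elsewhere in the repo) is available; the function also reads
# the module-level flag is_jl, which is set here only for the demo. _demo names are invented.
import numpy as np

is_jl = 0                                   # demo value: skip the Johnson-Lindenstrauss projection
A_demo = np.random.randn(500, 30)           # toy data matrix
w_demo = np.ones(A_demo.shape[0])           # uniform initial weights
C_demo = alaa_coreset(A_demo, 5, 0.1, w_demo, 0, 0)   # j=5, eps=0.1, SVD coreset, dense input
print(C_demo.shape)                         # about (j/eps, d) = (50, 30) weighted coreset rows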
Example #5
def SCNW_classic(A2, k, coreset_size, is_jl):
    """
    This function runs the CNW algorithm, exactly as elaborated in Feldman & Ras
    (a usage sketch follows the function).

    inputs:
    A2: data matrix, n points, each of dimension d.
    k: an algorithm parameter which determines the normalization needed and the error given the coreset size.
    coreset_size: the maximal coreset size (number of nonzero rows) demanded for the output.
    output:
    SA0: the weighted CNW coreset rows.
    ind: the indices of the selected rows.
    """
    coreset_size = int(coreset_size)
    if is_jl == 1:
        dex = int(k * np.log(A2.shape[0]))

        ran = np.random.randn(A2.shape[1], dex)
        A1 = SM.dot(A2, ran)
    else:
        A1 = np.copy(A2)
    print('A1.shape', A1.shape)
    epsi = np.sqrt(k / coreset_size)  #
    A, A3 = initializing_data(A1, k)
    print('A.shape', A.shape)
    At = np.transpose(A)
    AtA = np.dot(At, A)
    num_of_channels = A.shape[1]
    ww = np.zeros((int(coreset_size)))
    Z = np.zeros((num_of_channels, num_of_channels))
    X_u = k * np.diag(np.ones(num_of_channels))
    X_l = -k * np.diag(np.ones(num_of_channels))
    delta_u = epsi + 2 * np.power(epsi, 2)
    delta_l = epsi - 2 * np.power(epsi, 2)
    ind = np.zeros(int(coreset_size), dtype=int)

    for j in range(coreset_size):
        if j % 50 == 1:
            print('j=', j)
        X_u = X_u + delta_u * AtA
        X_l = X_l + delta_l * AtA
        Z, jj, t = single_CNW_iteration_classic(A, At, delta_u, delta_l, X_u,
                                                X_l, Z)
        ww[j] = t
        ind[j] = jj
    sqrt_ww = np.sqrt(epsi * ww / k)
    sqrt_ww = np.reshape(sqrt_ww, (len(sqrt_ww), 1))
    if is_jl == 1:
        SA0 = SM(A2)[ind, :].multiply(sqrt_ww)
    else:
        SA0 = np.multiply(A2[ind, :], sqrt_ww)
    return SA0, ind
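# --- Usage sketch for SCNW_classic (added for illustration; not part of the original code) ---
# A hedged example on dense toy data, assuming numpy as np and the repo's initializing_data /
# single_CNW_iteration_classic helpers (referenced above but defined elsewhere). _demo names
# are invented for this sketch.
import numpy as np

A_demo = np.random.randn(400, 25)
SA_demo, ind_demo = SCNW_classic(A_demo, k=4, coreset_size=40, is_jl=0)
print(SA_demo.shape, ind_demo[:5])          # 40 reweighted rows and the first few selected indices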
def Nonuniform(AA0,k,is_pca,eps,spar): 
        """
        non uniform sampling opponent to our algorithm, from
        Varadarajan, Kasturi, and Xin Xiao. "On the sensitivity of shape fitting problems." arXiv preprint arXiv:1209.4893 (2012).‏
        input:
            AA0:data matrix
            k: dimension of the approximated subspace
            is_pca: if 1 will provide a coreset to PCA, 0 will provide coreset for SVD
            eps: detemines coreset size
            spar: is data in sparse format
        output:
            weighted coreset
        """
        d=AA0.shape[1]
        if is_pca==1:
                k=k+1
                AA0=PCA_to_SVD(AA0,eps,spar)
        if is_jl==1:
            dex=int(k*np.log(AA0.shape[0]))
            ran=np.random.randn(AA0.shape[1],dex)
            if spar==1:
                AA=SM.dot(AA0,ran)
            else:
                AA=np.dot(AA0,ran)
        else:
            AA=AA0
        size_of_coreset=int(k+k/eps-1) 
        U,D,VT=ssp.linalg.svds(AA,k)       
        V = np.transpose(VT)
        AAV = np.dot(AA, V)
        del V
        del VT    
        x = np.sum(np.power(AA, 2), 1)
        y = np.sum(np.power(AAV, 2), 1)
        P = np.abs(x - y)
        AAV=np.concatenate((AAV,np.zeros((AAV.shape[0],1))),1)
        Ua, _, _ = ssp.linalg.svds(AAV,k)
        U = np.sum(np.power(Ua, 2), 1)
        pro = 2 * P / np.sum(P) + 8 * U
        if is_pca==1:
            pro=pro+81*eps
        pro0 = pro / sum(pro)
        w=np.ones(AA.shape[0])
        u=np.divide(w,pro0)/size_of_coreset
        DMM_ind=np.random.choice(AA.shape[0],size_of_coreset, p=pro0)
        u1=np.reshape(u[DMM_ind],(len(DMM_ind),1))
        if spar==1:
            SA0=SM(AA0)[DMM_ind,:d].multiply(np.sqrt(u1))
        else:
            SA0=np.multiply(np.sqrt(u1),AA0[DMM_ind,:d])
        return SA0   
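# --- Usage sketch for Nonuniform (added for illustration; not part of the original code) ---
# A hedged example of the sensitivity-based sampling baseline on dense toy data. The function
# above uses ssp.linalg.svds and the module-level flag is_jl, so both are set up here under
# the assumption that ssp is scipy.sparse. _demo names are invented for this sketch.
import numpy as np
import scipy.sparse as ssp
import scipy.sparse.linalg                  # makes ssp.linalg.svds available

is_jl = 0                                   # demo value for the module-level flag
A_demo = np.random.randn(300, 20)
SA_demo = Nonuniform(A_demo, k=4, is_pca=0, eps=0.2, spar=0)
print(SA_demo.shape)                        # int(k + k/eps - 1) = 23 weighted rows, 20 columns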
def squaredis(P,Cent):    
    d=Cent.shape[1]
    C=SM((Cent.shape[0],d+2))    
    C[:,1]=1      # C holds, per centroid, the augmented row [||c||^2, 1, c]; dotted with the rows of P
                  # (built by make_P) it yields the squared point-to-centroid distances (a numeric check follows the function).
    C[:,0] =SM.sum(SM.power(Cent, 2), 1)
    C[:,2:d+2]=Cent
    D=SM.dot(P,C.T)
    D=D.toarray()
    Tags=D.argmin(1) # finding the closest centroid for each point
    if min(D.shape)>1:
        dists=D.min(1)
    else:
        dists=np.ravel(D)
    y=D.argmin(0)
    return dists,Tags,y 
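# --- Numeric check of the identity behind squaredis (added for illustration) ---
# squaredis stores each centroid c as the augmented row [||c||^2, 1, c]; if the point rows of P
# are augmented as [1, ||p||^2, -2*p] (which is presumably what make_P produces), the dot
# product gives ||p||^2 - 2*p.c + ||c||^2 = ||p - c||^2. A small sanity check with numpy as np:
import numpy as np

p_demo = np.random.randn(6)
c_demo = np.random.randn(6)
direct = np.sum((p_demo - c_demo) ** 2)                               # plain squared distance
aug_p = np.concatenate(([1.0, np.sum(p_demo ** 2)], -2 * p_demo))     # augmented point row
aug_c = np.concatenate(([np.sum(c_demo ** 2), 1.0], c_demo))          # augmented centroid row, as C above
assert np.allclose(direct, np.dot(aug_p, aug_c))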
Example #8
def Nonuniform_Alaa(AA0, k, is_pca, eps, spar):
    """
    Variant of Nonuniform above: the same sensitivity-based sampling, but this version also
    returns the sampling probabilities and the coreset size alongside the weighted coreset.
    """
    d = AA0.shape[1]
    if is_pca == 1:
        k = k + 1
        AA0 = PCA_to_SVD(AA0, eps, spar)
    if is_jl == 1:
        dex = int(k * np.log(AA0.shape[0]))
        ran = np.random.randn(AA0.shape[1], dex)
        if spar == 1:
            AA = SM.dot(AA0, ran)
        else:
            AA = np.dot(AA0, ran)
    else:
        AA = AA0

    size_of_coreset = int(k + k / eps - 1)
    U, D, VT = ssp.linalg.svds(AA, k)
    V = np.transpose(VT)
    print('spar', spar)
    print('is_jl', is_jl)
    AAV = np.dot(AA, V)
    del V
    del VT
    x = np.sum(np.power(AA, 2), 1)
    y = np.sum(np.power(AAV, 2), 1)
    P = np.abs(x - y)
    AAV = np.concatenate((AAV, np.zeros((AAV.shape[0], 1))), 1)
    Ua, _, _ = ssp.linalg.svds(AAV, k)
    U = np.sum(np.power(Ua, 2), 1)
    pro = 2 * P / np.sum(P) + 8 * U
    if is_pca == 1:
        pro = pro + 81 * eps
    pro0 = pro / sum(pro)
    w = np.ones(AA.shape[0])
    u = np.divide(w, pro0) / size_of_coreset
    DMM_ind = np.random.choice(AA.shape[0], size_of_coreset, p=pro0)
    u1 = np.reshape(u[DMM_ind], (len(DMM_ind), 1))
    if spar == 1:
        SA0 = SM(AA0)[DMM_ind, :d].multiply(np.sqrt(u1))

    else:
        SA0 = np.multiply(np.sqrt(u1), AA0[DMM_ind, :d])
    return pro0, SA0, 0, size_of_coreset  #,#AA.shape[0]*SA0/size_of_coreset
Example #9
def new_streaming(A,
                  k,
                  cs,
                  sizes,
                  is_sparse=0):  # k is the number of components (the "thin" dimension) of the SVD in Alg. 1
    beg = time.time()
    d = A.shape[1]
    Y = np.arange(cs)
    if is_sparse == 1:
        PSI = SM((d, d))
    else:
        PSI = np.zeros((d, d))
    M = [[] for _ in range(int(cs))]  # M is a list of cs lists of indices into A
    ord1 = [[] for _ in range(int(cs))]  # ord1: one bookkeeping list per copy, same layout as M
    B = [[] for _ in range(int(cs))]  # B[y] will hold the rows of A indexed by M[y]
    u = np.random.rand(10000, cs)  # a probability for each point
    Q = [0] * len(sizes)
    w = [0] * len(sizes)
    j = 0
    Times = []
    from tqdm import tqdm
    #B=A[0:1,:]
    for i in tqdm(range(1, A.shape[0])):
        # if np.mod(i,10000)==1:
        #    np.save(str(i)+'.npy',i)
        a = A[i:i + 1, :]
        for y in Y:
            M[y].append(i)
        for y in Y:
            B[y] = A[M[y], :]
        PSI, Q1, w1, s, M, time1 = stream(B, a, k, PSI, Y, M, ord1, i, u,
                                          is_sparse, beg)
        if i in sizes:
            if Q1 == []:
                Q[j] = A[np.random.choice(i, k + 1), :]
                w[j] = (i / (k + 1)) * np.ones((k + 1, 1))
            else:
                Q[j] = A[np.ravel(Q1), :]
                w[j] = w1
            j = j + 1
            Times.append(time1)
    return Q, w, Times
Example #10
def alaa_coreset(wiki0, j, eps, coreset_size, w, is_pca, spar):
    #print('1')
    dex = int(j * np.log(wiki0.shape[0]))
    d = wiki0.shape[1]
    if is_pca == 1:
        j = j + 1
        wiki0 = PCA_to_SVD(wiki0, eps, spar)
    wiki = wiki0
    w = w / wiki.shape[0]
    sensetivities = []
    jd = j
    w1 = np.reshape(w, (len(w), 1))
    wiki1 = np.multiply(np.sqrt(w1), wiki)
    k = 0
    for i, p in enumerate(wiki1):
        k = k + 1
        sensetivities.append(calc_sens(wiki1, p, jd, eps))
    p0 = np.asarray(sensetivities)
    if is_pca == 1:
        p0 = p0 + 81 * eps
    indec = np.random.choice(
        np.arange(wiki.shape[0]), int(coreset_size),
        p=p0 / np.sum(p0))  #sampling according to the sensitivity
    p = p0 / np.sum(p0)  #normalizing sensitivities
    w = np.ones(wiki.shape[0])

    u = np.divide(np.sqrt(w), p) / coreset_size  #calculating new weights
    u1 = u[indec]
    #u1=u1/np.mean(u1)
    u1 = np.reshape(u1, (len(u1), 1))
    squ = np.sqrt(u1)
    if spar == 1:
        C = SM(wiki0)[indec, :d].multiply(squ)
    else:

        C = np.multiply(squ, wiki0[indec, :d])
    return p, C, 0, u[
        indec], coreset_size  #,wiki.shape[0]*wiki[indec,:]/coreset_size#
def k_means_clustering( A,  w ,K, iter_num,exp=1,ind=[],is_sparse=0,is_kline=0,): 
    """
    Weighted Lloyd k-means. If iter_num >= 1 (or iter_num == 0), runs that many Lloyd iterations;
    if 0 < iter_num < 1, iterates until the relative change of the weighted cost drops below iter_num.
    exp=0 returns, for each centroid, the closest data point instead of the centroid itself.
    """
    if ind==[]:    
        ind=np.random.permutation(len(w))[0:K]
    Qnew=A[ind,:]
    P=make_P(A)
    dists1=0
    if (iter_num>=1)+(iter_num==0):
        for i in range(0,iter_num):
            Qnew=Lloyd_iteration2(A,P,w,Qnew) 
            dists0=dists1
            dists1,Tags1,tagss=squaredis(P,Qnew) 
            conv=np.abs(np.sum(np.multiply(w,dists0))-np.sum(np.multiply(w,dists1)))/np.sum(np.multiply(w,dists1))
            print('conv',conv)

    else:     
        Qjl=np.zeros(Qnew.shape)   
        dists0=0
        dists1,Tags1,tagss=squaredis(P,Qnew)    
        i=0        
        conv=np.abs(np.sum(np.multiply(w,dists0))-np.sum(np.multiply(w,dists1)))/np.sum(np.multiply(w,dists1))
        while conv>iter_num:
            Qjl=Qnew
            Qnew=Lloyd_iteration2(A,P,w,Qjl)    
            i=i+1      
            dists0=dists1
            dists1,Tags1,tagss=squaredis(P,Qnew)
            print(np.sum(np.multiply(w,dists1))/500)
            conv=np.abs(np.sum(np.multiply(w,dists0))-np.sum(np.multiply(w,dists1)))/np.sum(np.multiply(w,dists1))
            print('conv',i)
    print('&&&&&&',len(np.unique(tagss)))
    if exp==0:
        Q=SM(A)[tagss,:]
    else:
        Q=Qnew
    return Q,w 
is_partial = 0
num_of_cent_amount = 1  #don't touch, need to get to 480
exp_num = 7  #want to have 4 coreset sizes for the stds.
n = Data.shape[0]
w = np.ones(n)

iter_num = 0.01

d = Data.shape[1]
P2 = gsc.make_P_dense(Data)
num_of_lines = 4
num_of_r = 3
print('Hi')
k_freq = 50  #5 is noisy
beg0 = time.time()
B0, inds = gsc.kmeans_plspls1(SM(Data), w, 0, [], k_freq * num_of_cent_amount,
                              np.ravel(w), 0.01, 1, 0)
inds = np.ravel(np.asarray(inds))
B0 = B0.toarray()
print('prod B', type(B0))
print('prod B', time.time() - beg0)
exact = 1
error1 = np.zeros((num_of_lines + 1, num_of_cent_amount))
error2 = np.zeros((num_of_lines + 1, num_of_cent_amount))
error3 = np.zeros((num_of_lines + 1, num_of_cent_amount))
error4 = np.zeros((num_of_lines + 1, num_of_cent_amount))

num_of_clus = np.zeros(num_of_cent_amount)
error = np.zeros((num_of_r * num_of_lines + 1, exp_num))
erroro = np.zeros((num_of_r * num_of_lines + 1, exp_num))
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
num_of_css=5

is_pca=0 #0 for SVD, 1 for PCA 
is_sparse=1
Data=np.random.randn(100,11)
if is_sparse==0:

    mean_data=np.mean(Data,0)
    mean_data=np.reshape(mean_data,(1,len(mean_data)))

else:    
    Data1=SM(Data)
    mean_data=Data1.mean(0)
Datam=Data-np.dot(np.ones((Data.shape[0],1)),mean_data)
Data1=SM(Data)

n=Data.shape[0]
d=Data.shape[1]
opt_error=1
r=1
is_to_save=0
is_save=1

is_big_data=1
alg_list=[0,3,5]
t0=1
def old_clustering( A,w,alfa_app,eps,V, K,is_sparse,is_plspls=0,is_klinemeans=0):

        """
        inputs:
            A: data matrix, n points, each of dimension d.
            K: number of centroids demanded for the k-means.
            is_sparse: '0' the data is dense, '1' the data is given in sparse format.
            is_plspls: '1' to initialize with the kmeans++ algorithm, which bounds the error; '0' random initialization.
            is_klinemeans: '1' calculates k-line-means, '0' calculates Lloyd's k-means.

        output:
            SA0: "ready coreset": a matrix of size K*d, the coreset points multiplied by their weights.
            GW1: weights.
            Tags1: data indices of the points chosen for the coreset.
            (As implemented below, the function returns Cent together with two empty placeholders.)
        A usage sketch follows the function.
        """
        #sensitivity=0.01
        num_of_samples = A.shape[0]
        
        if is_klinemeans==1:
            if is_sparse==0:
                A1,weights1=nor_data(A)
            else:
                A1,weights1=nor_data1(A)
            weights1=np.reshape(weights1,(len(weights1),1))
            weights=np.multiply(w,weights1)
        else:
            if is_sparse==0:
                A1=np.copy(A)
            else:
                A1=SM.copy(A)
            weights=w
        print('A1',type(A1))
        print('A1',type(A1.shape[0]))
        print('A1',type(A1.shape[1]))

        num_of_samples = A1.shape[0]
        num_of_channels = A1.shape[1]
        K=int(K)
        if is_sparse==0:
            P=make_P_dense(A1)       
            Cent=np.zeros((K,num_of_channels))
        else:
            P=make_P(A1)       
            Centt=SM((K,num_of_channels))
        if is_plspls==1:
            Centt,per=kmeans_plspls1(A1,np.ravel(np.power(weights,2)),eps,V,K,np.power(weights,2),alfa_app,is_sparse,is_jl=0)            
        else:
            per=np.random.permutation(num_of_samples)
            #Cent[0:K,:]=A1[per[0:K],:]
        if is_sparse==0:
            #Cent=A1[np.ravel(per[0:K]),:]
            print('****per****',len(np.unique(per)))
            Cent=np.concatenate((A1[np.ravel(per[0:K]),:],A1[np.ravel(per[0:K]),:]),0)
        else:
            #Cent=vstack((A1[np.ravel(per[0:K]),:],A1[np.ravel(per[0:K]),:]))
            Cent=A1[np.ravel(per[0:K]),:]
            print('****per****',len(np.unique(per)))
        K1=Cent.shape[0]
    
        
        iter=0
        Cost=50 # initial value; only needs to be nonzero
        old_Cost=2*Cost
    
        Tags=np.zeros((num_of_samples,1)) # a vector stores the cluster of each point
        print('c0s',Cent.shape)
        sensitivity=0.01
        it=0
        while np.logical_or(it<1,np.logical_and(min(Cost/old_Cost,old_Cost/Cost)<sensitivity,Cost>0.000001)): # keep iterating while the cost still changes drastically
        #for i in range(10):
                            # relative to the previous one; stop once the ratio of consecutive costs gets close to one (exceeds the parameter "sensitivity") or the cost becomes negligible
            group_weights=np.zeros((K1,1))
            iter=iter+1 #counting the iterations. only for control
            old_Cost=Cost #the last calculated Cost becomes the old_Cost, and a new Cost is going to be calculated.
            if is_sparse==0:            
                Cent1=np.copy(Cent)
                Dmin,Tags,Tags1=squaredis_dense(P,Cent1)
            else:
                Cent1=SM.copy(Cent)
                Dmin,Tags,Tags1=squaredis(P,Cent1)
            #print('Tags',Tags)
            Cost=np.sum(Dmin) #the cost is the summation of all of the minimal distances
            for kk in range (1,K1+1):
                wheres=np.where(Tags==kk-1)  # finding the indices of cluster kk
                #print('wheres',weights[wheres[0]])
                weights2=np.power(weights[wheres[0]],1)  # finding the weights of cluster kk
                group_weights[kk-1,:]=np.sum(weights2)
              
            it=it+1           
            
        GW1=np.power(group_weights,1)

        print('***GW1***',len(np.where(GW1>0)[0]))
        F=Cent
        if is_sparse==0:
            
            SA0=np.multiply(GW1,F) #We may weight each group with its overall weight in order to compare it to the original data.   
        else:
            SA0=F.multiply(GW1)
#        print('SA0',SA0)
        return Cent,[],[]    
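# --- Usage sketch for old_clustering (added for illustration; not part of the original code) ---
# A hedged example on dense toy data with random initialization (is_plspls=0), assuming numpy
# as np and the repo's make_P_dense / squaredis_dense helpers. _demo names are invented.
import numpy as np

A_demo = np.random.randn(300, 15)
w_demo = np.ones((A_demo.shape[0], 1))
Cent_demo, _, _ = old_clustering(A_demo, w_demo, 0.01, 0.1, [], 10,
                                 is_sparse=0, is_plspls=0, is_klinemeans=0)
print(Cent_demo.shape)                      # (20, 15): the dense random-init branch stacks the K=10 seed rows twice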
def clus_streaming(path,Data,j,is_pca,alg,h,spar,trial=None,datum=None,is_jl=1,gamma1=0.000000001):
    """
    alg=0 unif sampling
    alg=1 Sohler
    alg=2 CNW
    alg=3 Alaa
    """
    sizeB=j
    coreset_size=Data.shape[0]//(2**(h+1))
    k=0
    T_h= [0] * (h+1) #line 5
    DeltaT_h= [0] * (h+1) #line 4
    u_h=[0]* (h+1) #line 4
    leaf_ind=np.zeros(h+1)
    iter_num=1
    for jj in range(np.power(2,h)): #over all of the leaves
        w=np.ones(2*coreset_size)
        Q0=Data[k:k+2*coreset_size,:]       
        if alg>0: 
            B,inds= kmeans_plspls1(Q0,np.ravel(w),0,[],sizeB,np.ravel(w),0.01,1,0)
            Prob,partition,sum_weights_cluster=Coreset_FBL(Q0,w,B,1)
        if alg>1: 
            Q1,dists11=k_means_clustering(Q0,w,j,iter_num,inds)
        k=k+2*coreset_size
        print('k',k)
        #line 10
        if alg==0: 
            ind=np.random.choice(Q0.shape[0],coreset_size)
            T=Q0[ind,:]
            w=w[0]*np.ones((T.shape[0],1))#*2
        if alg==1:
            _,w,T=FBL(Q0,Q0,Prob,partition,sum_weights_cluster,w,inds,[],coreset_size,0,1,0)
            #w=w*2
        if alg==2:
            _,w,T=FBL(Q0,Q0,Prob,partition,sum_weights_cluster,w,inds,Q1,coreset_size,0,0,1,0.00001)
            #w=np.sqrt(w)
            print('w',w)
        if alg==3:
            _,w,T=FBL(Q0,Q0,Prob,partition,sum_weights_cluster,w,inds,Q1,coreset_size,0,0,1,0.3)
        if alg==4:
            _,w,T=FBL(Q0,Q0,Prob,partition,sum_weights_cluster,w,inds,Q1,coreset_size,0,1,1)
        DeltaT=0
        i=0                        
        u_h[0]=w
        # line 13
        while (i<h)*(type(T_h[i])!=int): # whenever this leaf has a neighbouring leaf at the same level, merge and reduce them
            wT=np.concatenate((w,np.asarray(u_h[i])),0) #line 14
            #line 15 union
            if spar==0:
               totT0=np.concatenate((T,np.asarray(T_h[i])),0)
            else: 
               totT0=vstack((T,T_h[i]))
            totT0=SM(totT0)
            #line 15
            if alg>0:
                B,inds= kmeans_plspls1(totT0,np.ravel(wT),0,[],sizeB,np.ravel(wT),0.01,1,0)
                Prob,partition,sum_weights_cluster=Coreset_FBL(totT0,wT,B,1)  
            if alg>2:
                Q1,dists11=k_means_clustering(totT0,wT,j,iter_num,inds)
            if alg==0:
                T=totT0[np.random.choice(totT0.shape[0],coreset_size),:]
                w=w[0]*np.ones((T.shape[0],1))#*2
            if alg==1:
                T1,w,T=FBL(totT0,totT0,Prob,partition,sum_weights_cluster,wT,inds,[],coreset_size,0,1,0)
                #w=w*2
            if alg==2:
                T1,w,T=FBL(totT0,totT0,Prob,partition,sum_weights_cluster,wT,inds,[],coreset_size,0,0,0)
                #w=np.sqrt(w)
            if alg==3:
                T1,w,T=FBL(totT0,totT0,Prob,partition,sum_weights_cluster,wT,inds,Q1,coreset_size,0,1,1)
            if alg==4:
                T1,w,T=FBL(totT0,totT0,Prob,partition,sum_weights_cluster,wT,inds,Q1,coreset_size,0,0,1)
            DeltaT=0  
            u_h[i]=0
            DeltaT=DeltaT+0 # zeroing the leaf that was reduced
            T_h[i]=0
            DeltaT_h[i]=0
            leaf_ind[i]=leaf_ind[i]+1
            i=i+1
        T_h[i]=T
        u_h[i]=w        
        T1=T.multiply(w)
        #saving all leaves
        if spar==0:            
            if datum==0:
                np.save(path+'leaves_gyro1/trial='+str(trial)+',j='+str(j)+',alg='+str(alg)+',floor='+str(i)+',leaf='+str(leaf_ind[i])+'.npy',T)
            if datum==1:
                np.save(path+'leaves_acc1/trial='+str(trial)+',j='+str(j)+',alg='+str(alg)+',floor='+str(i)+',leaf='+str(leaf_ind[i])+'.npy',T)
            if datum==2:
                np.save(path+'leaves_mnist/trial='+str(trial)+',j='+str(j)+',alg='+str(alg)+',floor='+str(i)+',leaf='+str(leaf_ind[i])+'.npy',T)
        else:
                ssp.save_npz(path+'trial='+str(trial)+',j='+str(j)+',alg='+str(alg)+',floor='+str(i)+',leaf='+str(leaf_ind[i])+'.npz',T)
                np.save(path+'trial='+str(trial)+',j='+str(j)+',alg='+str(alg)+',floor='+str(i)+',leaf='+str(leaf_ind[i])+'_weights.npy',w)
        DeltaT_h[i]=DeltaT
        Q=[]        
#    if type(T_h[h])==int: #should be remained only the upper one. if not:
    #all_levels=[]
#        for g in range (h+1): #collecting all leaves which remained on tree.
#            if type(T_h[g])!=int:
#                if all_levels==[]:
#                   all_levels=np.asarray(T_h[g])
#                else:
#                    all_levels=np.concatenate((all_levels,np.asarray(T_h[g])),0)
#        DeltaT_hs=sum(DeltaT_h[h]) #summing its delta
#    else:
#        all_levels=T_h[h] 
#        DeltaT_hs=DeltaT_h[h]
    return []
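# --- Worked size check for the streaming tree in clus_streaming (added for illustration) ---
# With tree height h, the routine above reads the data in 2**h leaves of 2*coreset_size points
# each, where coreset_size = Data.shape[0] // 2**(h+1), so the leaves together cover all n points.
n_demo, h_demo = 16384, 3
coreset_size_demo = n_demo // (2 ** (h_demo + 1))   # 1024 points kept per reduction
points_per_leaf_demo = 2 * coreset_size_demo        # 2048 points consumed per leaf
print((2 ** h_demo) * points_per_leaf_demo == n_demo)   # True: 8 leaves * 2048 points = 16384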
def FBL(P0,P,Prob,partition,sum_weights_cluster,w,indsB,Q,coreset_size,is_not_sparse,full_sampling,posi,eps=0.1):
    """
    Importance sampling of a coreset according to Prob.
    full_sampling=1 samples all coreset_size points by Prob; full_sampling=0 samples
    coreset_size-len(indsB) points and keeps the seed rows indsB with adjusted weights.
    posi=1 zeroes the weights flagged by FBL_positive.
    Returns the weighted coreset C, the weights, and the (unweighted) selected rows X.
    """
    Prob=Prob/np.sum(Prob)
    if is_not_sparse==0:
        P0=SM(P0)
    if full_sampling==1:
        ind=np.random.choice(np.arange(len(Prob)),coreset_size,p=np.ravel(Prob)) 
        u=np.divide(np.ravel(w),Prob)/coreset_size    
       #u[np.where(u=='nan')[0]==0]=0
        if posi==1:
            print('is_not_sparse',is_not_sparse)
            if is_not_sparse==0:
                u=FBL_positive(P0,u,P0[np.ravel(indsB),:],Q,eps,1-is_not_sparse)
            else:
                u=FBL_positive(P,u,P[np.ravel(indsB),:],Q,eps,1-is_not_sparse)
        u1=u[ind]
        print('uuuuuu',u[0:10])
        print('uuuuuu1',u1[0:10])

        u1=np.reshape(u1,(u1.shape[0],1))  
        
    else:

            #ind,u1=FBL_median(Prob,P,w,Q,P[np.ravel(indsB),:],partition,sum_weights_cluster,coreset_size,posi,1-is_not_sparse)
        ind=np.random.choice(np.arange(len(Prob)),coreset_size-len(indsB),p=np.ravel(Prob))
        u=np.divide(np.ravel(w),Prob)/coreset_size
        print('ttttuuuuuuttttt',len(u))
        ub=np.zeros(len(indsB))
        if is_not_sparse==1:
            PP0=make_P_dense(P0)
            _,tags,_=squaredis_dense(PP0,P0[indsB,:])
        else:
            PP0=make_P(P0)
            _,tags,_=squaredis(SM(PP0),SM(P0[np.ravel(indsB),:]))
        #print('taggggggggs',tags,len(tags))
        for i in range(len(indsB)):        
            inte=np.intersect1d(ind,np.where(tags==i)[0])
            #ubc[i]=np.sum(w[inte])
            ub[i]=np.sum(w[np.where(tags==i)[0]])-np.sum(u[inte])
            #ub[i]=np.sum(w[indsB[i]])
            #if indsB[i] in inte:
            #    ub[i]=ub[i]-u[indsB[i]]
        #ub=np.abs(ub)
        u1=np.concatenate((u[ind],ub))
        print('ttttuuuuuuttttt1',len(u1))

        if posi==1:
            print('is_not_sparse',is_not_sparse)
            if is_not_sparse==0:
                u1=FBL_positive(vstack((P0[ind,:],P0[np.ravel(indsB),:])),u1,P0[np.ravel(indsB),:],Q,eps,1-is_not_sparse)
            else:
                u1=FBL_positive(np.concatenate((P[ind,:],P[np.ravel(indsB),:]),0),u1,P0[np.ravel(indsB),:],Q,eps,1-is_not_sparse)

    ind=ind.astype(int)
    u1=np.reshape(u1,(len(u1),1))
        
    if full_sampling==0:
        if is_not_sparse==0:
            print('indsBra',np.ravel(indsB))
            print('indsBsh',np.ravel(indsB).shape)

            X=vstack((P0[np.ravel(ind),:],P0[np.ravel(indsB),:]))
        else:
            X=np.concatenate((P0[np.ravel(ind),:],P0[np.ravel(indsB),:]),0)
    else:
            X=P0[ind,:]
    if is_not_sparse==0:
            print(u1.shape)
            print(X.shape)

            C=X.multiply(u1[:X.shape[0],:])
    else:
           C=np.multiply(u1[:X.shape[0]],X)    
    print('Csh',C.shape[0])
    return C,u1[:X.shape[0]],X #for streaming flip X and C.