def cpp_sparse_kernel():
    """Build a sparse euclidean similarity kernel through the C++ helper.

    Reads the module-level ``dataArray``, ``num_neighbors`` and
    ``num_samples``. The result of ``subcp.create_kernel`` is used as three
    parallel sequences (values, row indices, column indices).

    Returns:
        A ``num_samples x num_samples`` CSR matrix holding the kernel values.
    """
    kernel_parts = np.array(
        subcp.create_kernel(dataArray.tolist(), "euclidean", num_neighbors))
    values = kernel_parts[0]
    # Indices come back as non-integer numbers; cast them before indexing.
    row_ids = [int(r) for r in kernel_parts[1]]
    col_ids = [int(c) for c in kernel_parts[2]]
    return sparse.csr_matrix((values, (row_ids, col_ids)),
                             [num_samples, num_samples])
def cpp_dense_kernel():
    """Build a dense euclidean similarity kernel through the C++ helper.

    Reads the module-level ``dataArray`` and ``num_samples``. Every data
    point is requested as a neighbour, and the (values, rows, cols) triplets
    returned by ``subcp.create_kernel`` are scattered into a zero matrix.

    Returns:
        A ``num_samples x num_samples`` numpy array holding the kernel values.
    """
    neighbor_count = np.shape(dataArray)[0]
    kernel_parts = np.array(
        subcp.create_kernel(dataArray.tolist(), "euclidean", neighbor_count))
    row_ids = [int(r) for r in kernel_parts[1]]
    col_ids = [int(c) for c in kernel_parts[2]]
    dense = np.zeros((num_samples, num_samples))
    dense[row_ids, col_ids] = kernel_parts[0]
    return dense
def test_cosine_neigh3(self, data, val):
    """Check the 3-neighbour cosine kernel from the C++ helper against ``val``.

    ``data`` and ``val`` are supplied by the caller (presumably pytest
    fixtures/parametrization -- confirm against the enclosing test class).
    """
    kernel = subcp.create_kernel(data.tolist(), 'cosine', 3)
    row_ids = [int(r) for r in kernel[1]]
    col_ids = [int(c) for c in kernel[2]]
    n_points = np.shape(data)[0]
    # Scatter the (value, row, col) triplets into a dense matrix for comparison.
    dense = np.zeros((n_points, n_points))
    dense[row_ids, col_ids] = kernel[0]
    assert np.allclose(dense, val)
def test_cosine_full(self, data):
    """Check the full cosine kernel from the C++ helper against sklearn."""
    expected = cosine_similarity(data)  # sklearn ground truth
    n_points = np.shape(data)[0]
    # Asking for n_points neighbours requests the complete kernel.
    kernel = subcp.create_kernel(data.tolist(), 'cosine', n_points)
    row_ids = [int(r) for r in kernel[1]]
    col_ids = [int(c) for c in kernel[2]]
    dense = np.zeros((n_points, n_points))
    dense[row_ids, col_ids] = kernel[0]
    assert np.allclose(dense, expected)
def test_euclidean_full(self, data):
    """Check the full euclidean kernel from the C++ helper against sklearn.

    The reference similarity is ``exp(-distance * gamma)`` with
    ``gamma = 1 / number_of_features``.
    """
    distances = euclidean_distances(data)
    gamma = 1 / np.shape(data)[1]
    expected = np.exp(-distances * gamma)  # sklearn ground truth
    n_points = np.shape(data)[0]
    # Asking for n_points neighbours requests the complete kernel.
    kernel = subcp.create_kernel(data.tolist(), 'euclidean', n_points)
    row_ids = [int(r) for r in kernel[1]]
    col_ids = [int(c) for c in kernel[2]]
    dense = np.zeros((n_points, n_points))
    dense[row_ids, col_ids] = kernel[0]
    assert np.allclose(dense, expected)
def __init__(self,
             n,
             mode,
             lambdaVal,
             separate_rep=None,
             n_rep=None,
             mgsijs=None,
             ggsijs=None,
             data=None,
             data_rep=None,
             metric="cosine",
             num_neighbors=None):
    """Validate the arguments, build any missing kernel and create the C++ GraphCut object.

    Args:
        n: Number of elements in the ground set; must be positive.
        mode: Kernel representation, ``'dense'`` or ``'sparse'``.
        lambdaVal: Forwarded unchanged to the C++ ``GraphCut`` constructor.
        separate_rep: ``True`` when the represented set differs from the
            ground set (dense mode only). ``None`` is stored as ``False``.
        n_rep: Number of elements in the represented set; required and
            positive when ``separate_rep`` is ``True``.
        mgsijs: Optional kernel. With ``separate_rep`` it must be an
            ``n_rep x n`` ndarray; otherwise it is used as the ground kernel.
        ggsijs: Optional ``n x n`` ground kernel (ndarray in dense mode,
            CSR matrix in sparse mode).
        data: Ground data matrix used to compute a missing kernel; must
            have ``n`` rows and support ``.tolist()``.
        data_rep: Represented data matrix used to compute a missing
            ``mgsijs``; must have ``n_rep`` rows.
        metric: Metric name forwarded to the ``subcp`` kernel builders.
        num_neighbors: Required and positive in sparse mode; must be left
            ``None`` when a dense kernel is computed from ``data``.

    Raises:
        Exception: On any inconsistent or missing argument.
    """
    self.n = n
    self.mode = mode
    self.lambdaVal = lambdaVal
    self.separate_rep = separate_rep
    self.n_rep = n_rep
    self.mgsijs = mgsijs
    self.ggsijs = ggsijs
    self.data = data
    self.data_rep = data_rep
    self.metric = metric
    self.num_neighbors = num_neighbors
    self.clusters = None
    self.cluster_sijs = None
    self.cluster_map = None
    self.cpp_obj = None
    self.cpp_ggsijs = None
    self.cpp_mgsijs = None
    self.cpp_ground_sub = {-1}  # Dummy set so that the pybind11 binding succeeds
    self.cpp_content = None
    self.effective_ground = None

    def _require_ground_data():
        # A kernel can only be computed from a data matrix with exactly n rows.
        if self.data is None:
            raise Exception("Data missing to compute ggsijs")
        if np.shape(self.data)[0] != self.n:
            raise Exception(
                "ERROR: Inconsistency between n and no of examples in the given ground data matrix"
            )

    def _kernel_triplets():
        # Ask the C++ helper for the kernel as (values, rows, cols) triplets.
        self.cpp_content = np.array(
            subcp.create_kernel(self.data.tolist(), self.metric,
                                self.num_neighbors))
        values = self.cpp_content[0]
        rows = list(self.cpp_content[1].astype(int))
        cols = list(self.cpp_content[2].astype(int))
        return values, rows, cols

    def _dense_ggsijs_from_data():
        # Compute the full n x n ground kernel from the ground data.
        _require_ground_data()
        if self.num_neighbors is not None:
            raise Exception("num_neighbors wrongly provided for dense mode")
        # Using all data as num_neighbors in case of dense mode
        self.num_neighbors = np.shape(self.data)[0]
        values, rows, cols = _kernel_triplets()
        kernel = np.zeros((self.n, self.n))
        kernel[rows, cols] = values
        return kernel

    def _nested_list(kernel):
        # pybind11 needs a list of lists; a 1-D array (1x1 similarity matrix)
        # turns into a flat list of scalars, so wrap it once more.
        as_list = kernel.tolist()
        if type(as_list[0]) in (int, float):
            as_list = [as_list]
        return as_list

    if self.n <= 0:
        raise Exception(
            "ERROR: Number of elements in ground set must be positive")
    if self.mode not in ['dense', 'sparse']:
        raise Exception(
            "ERROR: Incorrect mode. Must be one of 'dense' or 'sparse'")

    separate = self.separate_rep == True
    if separate:
        if self.n_rep is None or self.n_rep <= 0:
            raise Exception(
                "ERROR: separate represented intended but number of elements in represented not specified or not positive"
            )
        # This also rules out sparse mode with a separate represented set.
        if self.mode != "dense":
            raise Exception(
                "Only dense mode supported if separate_rep = True")
        if self.mgsijs is not None and not isinstance(self.mgsijs, np.ndarray):
            raise Exception("mgsijs provided, but is not dense")
        if self.ggsijs is not None and not isinstance(self.ggsijs, np.ndarray):
            raise Exception("ggsijs provided, but is not dense")

    both_missing = self.ggsijs is None and self.mgsijs is None
    both_given = self.ggsijs is not None and self.mgsijs is not None

    if self.mode == "dense":
        if separate:
            if self.mgsijs is None:
                # mgsijs not provided - compute it from both data matrices
                if self.data is None or self.data_rep is None:
                    raise Exception("Data missing to compute mgsijs")
                if np.shape(self.data)[0] != self.n or np.shape(
                        self.data_rep)[0] != self.n_rep:
                    raise Exception(
                        "ERROR: Inconsistentcy between n, n_rep and no of examples in the given ground data matrix and represented data matrix"
                    )
                self.mgsijs = np.array(
                    subcp.create_kernel_NS(self.data.tolist(),
                                           self.data_rep.tolist(),
                                           self.metric))
            elif np.shape(self.mgsijs)[1] != self.n or np.shape(
                    self.mgsijs)[0] != self.n_rep:
                raise Exception(
                    "ERROR: Inconsistency between n_rep, n and no of rows, columns of given mg kernel"
                )
            if self.ggsijs is None:
                self.ggsijs = _dense_ggsijs_from_data()
            elif np.shape(self.ggsijs)[0] != self.n or np.shape(
                    self.ggsijs)[1] != self.n:
                raise Exception(
                    "ERROR: Inconsistentcy between n and dimensionality of given similarity gg kernel"
                )
        elif both_missing:
            self.ggsijs = _dense_ggsijs_from_data()
        elif both_given:
            raise Exception(
                "Two kernels have been wrongly provided when separate_rep=False"
            )
        else:
            # Exactly one kernel was given: it serves as the ground kernel and
            # must be a dense n x n array.
            kernel = self.ggsijs if self.ggsijs is not None else self.mgsijs
            if (not isinstance(kernel, np.ndarray)
                    or np.shape(kernel)[1] != self.n
                    or np.shape(kernel)[0] != self.n):
                raise Exception(
                    "ERROR: Inconsistency between n and no of rows, columns of given kernel"
                )
            self.ggsijs = kernel
    else:  # sparse
        if self.num_neighbors is None or self.num_neighbors <= 0:
            raise Exception("Valid num_neighbors is needed for sparse mode")
        if both_missing:
            _require_ground_data()
            values, rows, cols = _kernel_triplets()
            self.ggsijs = sparse.csr_matrix((values, (rows, cols)),
                                            [self.n, self.n])
        elif both_given:
            raise Exception(
                "Two kernels have been wrongly provided when separate_rep=False"
            )
        else:
            # Exactly one kernel was given: it must be an n x n CSR matrix.
            kernel = self.ggsijs if self.ggsijs is not None else self.mgsijs
            # sparse.csr_matrix is the public name of the class formerly
            # reached through the deprecated scipy.sparse.csr namespace.
            if not isinstance(kernel, sparse.csr_matrix):
                raise Exception("Provided kernel is not sparse")
            if kernel.shape[0] != self.n or kernel.shape[1] != self.n:
                raise Exception(
                    "ERROR: Inconsistency between n and no of rows, columns of given kernel"
                )
            self.ggsijs = kernel

    if self.separate_rep is None:
        self.separate_rep = False

    if self.mode == "dense":
        self.cpp_ggsijs = _nested_list(self.ggsijs)
        if separate:
            self.cpp_mgsijs = _nested_list(self.mgsijs)
            self.cpp_obj = GraphCut(self.n, self.cpp_mgsijs, self.cpp_ggsijs,
                                    self.lambdaVal)
        else:
            self.cpp_obj = GraphCut(self.n, self.cpp_ggsijs, False,
                                    self.cpp_ground_sub, self.lambdaVal)
    else:
        # Break the CSR matrix into its three native component lists.
        self.cpp_ggsijs = {
            # non-zero values in the matrix (row major traversal)
            'arr_val': self.ggsijs.data.tolist(),
            # cumulative count of non-zero elements up to, but not including, the current row
            'arr_count': self.ggsijs.indptr.tolist(),
            # column index corresponding to each non-zero value in arr_val
            'arr_col': self.ggsijs.indices.tolist(),
        }
        self.cpp_obj = GraphCut(self.n, self.cpp_ggsijs['arr_val'],
                                self.cpp_ggsijs['arr_count'],
                                self.cpp_ggsijs['arr_col'], self.lambdaVal)

    self.effective_ground = self.cpp_obj.getEffectiveGroundSet()
def __init__(self,
             n,
             num_privates,
             lambdaVal,
             data_sijs=None,
             private_sijs=None,
             private_private_sijs=None,
             data=None,
             privateData=None,
             metric="cosine",
             privacyHardness=1):
    """Validate the arguments, build any missing kernels and create the C++ object.

    Either all three kernels are supplied, or both data matrices are supplied
    and the kernels are computed through the ``subcp`` helpers. If only some
    of the kernels are given they are recomputed from the data.

    Args:
        n: Number of elements in the ground set; must be positive.
        num_privates: Number of private elements; must be >= 0.
        lambdaVal: Forwarded to ``LogDeterminantConditionalGain``.
        data_sijs: Optional ``n x n`` ndarray kernel over the ground set.
        private_sijs: Optional ``n x num_privates`` ndarray kernel.
        private_private_sijs: Optional ``num_privates x num_privates``
            ndarray kernel.
        data: Ground data matrix with ``n`` rows (needs ``.tolist()``).
        privateData: Private data matrix with ``num_privates`` rows.
        metric: Metric name forwarded to the ``subcp`` kernel builders.
        privacyHardness: Forwarded to ``LogDeterminantConditionalGain``.

    Raises:
        Exception: On any inconsistent or missing argument.
    """
    self.n = n
    self.num_privates = num_privates
    self.lambdaVal = lambdaVal
    self.metric = metric
    self.privacyHardness = privacyHardness
    self.data_sijs = data_sijs
    self.private_sijs = private_sijs
    self.private_private_sijs = private_private_sijs
    self.data = data
    self.privateData = privateData
    self.cpp_obj = None
    self.cpp_data_sijs = None
    self.cpp_private_sijs = None
    self.cpp_private_private_sijs = None
    self.cpp_content = None
    self.cpp_content2 = None
    self.effective_ground = None

    def _dense_kernel(matrix, size):
        # Full size x size kernel of `matrix` with itself; returns the raw
        # helper output together with the dense array built from it.
        content = np.array(
            subcp.create_kernel(matrix.tolist(), self.metric, size))
        rows = list(content[1].astype(int))
        cols = list(content[2].astype(int))
        kernel = np.zeros((size, size))
        kernel[rows, cols] = content[0]
        return content, kernel

    def _nested_list(kernel):
        # pybind11 needs a list of lists; a 1-D array (1x1 similarity matrix)
        # turns into a flat list of scalars, so wrap it once more.
        as_list = kernel.tolist()
        if type(as_list[0]) in (int, float):
            as_list = [as_list]
        return as_list

    if self.n <= 0:
        raise Exception(
            "ERROR: Number of elements in ground set must be positive")
    if self.num_privates < 0:
        # Message previously said "queries", copied from another function.
        raise Exception("ERROR: Number of privates must be >= 0")

    all_kernels_given = (self.data_sijs is not None
                         and self.private_sijs is not None
                         and self.private_private_sijs is not None)
    if all_kernels_given:
        if not isinstance(self.data_sijs, np.ndarray):
            raise Exception(
                "Invalid data kernel type provided, must be ndarray")
        if not isinstance(self.private_sijs, np.ndarray):
            raise Exception(
                "Invalid private kernel type provided, must be ndarray")
        if not isinstance(self.private_private_sijs, np.ndarray):
            raise Exception(
                "Invalid private-private kernel type provided, must be ndarray"
            )
        if np.shape(self.data_sijs)[0] != self.n or np.shape(
                self.data_sijs)[1] != self.n:
            raise Exception("ERROR: data kernel should be n X n")
        if np.shape(self.private_sijs)[0] != self.n or np.shape(
                self.private_sijs)[1] != self.num_privates:
            raise Exception(
                "ERROR: Private Kernel should be n X num_privates")
        if np.shape(self.private_private_sijs)[
                0] != self.num_privates or np.shape(
                    self.private_private_sijs)[1] != self.num_privates:
            raise Exception(
                "ERROR: Private-private Kernel should be num_privates X num_privates"
            )
        if self.data is not None or self.privateData is not None:
            print(
                "WARNING: similarity kernels found. Provided data and private matrices will be ignored."
            )
    else:
        # Kernels are incomplete: compute all three from the data matrices.
        if self.data is None or self.privateData is None:
            raise Exception(
                "Since kernels are not provided, data matrices are a must")
        if np.shape(self.data)[0] != self.n:
            raise Exception(
                "ERROR: Inconsistentcy between n and no of examples in the given data matrix"
            )
        if np.shape(self.privateData)[0] != self.num_privates:
            raise Exception(
                "ERROR: Inconsistentcy between num_privates and no of examples in the given private data matrix"
            )

        # Data kernel: every data point is requested as a neighbour.
        self.num_neighbors = self.n
        self.cpp_content, self.data_sijs = _dense_kernel(
            self.data, self.num_neighbors)

        # Private kernel between the private data and the ground data.
        self.private_sijs = np.array(
            subcp.create_kernel_NS(self.privateData.tolist(),
                                   self.data.tolist(), self.metric))

        # Private-private kernel: every private point is a neighbour.
        self.num_neighbors2 = self.num_privates
        self.cpp_content2, self.private_private_sijs = _dense_kernel(
            self.privateData, self.num_neighbors2)

    # Break the kernels into native lists for the implicit pybind11 binding.
    self.cpp_data_sijs = _nested_list(self.data_sijs)
    self.cpp_private_sijs = _nested_list(self.private_sijs)
    self.cpp_private_private_sijs = _nested_list(self.private_private_sijs)

    self.cpp_obj = LogDeterminantConditionalGain(
        self.n, self.num_privates, self.cpp_data_sijs, self.cpp_private_sijs,
        self.cpp_private_private_sijs, self.lambdaVal, self.privacyHardness)
    self.effective_ground = set(range(n))
def __init__(self, n, n_master=-1, sijs=None, data=None, data_master=None,
             cluster_lab=None, mode=None, metric="cosine", num_neigh=-1,
             num_cluster=None, partial=False, ground_sub=None):
    """Validate the arguments, build the kernel if needed and create the C++ FacilityLocation object.

    Args:
        n: Number of elements in the ground set; must be positive.
        n_master: Number of rows expected in ``data_master``.
        sijs: Optional similarity matrix (ndarray or CSR matrix) with ``n`` rows.
        data: Ground data matrix with ``n`` rows, used when ``sijs`` is absent.
        data_master: Optional master data matrix; when given, a dense
            ground-vs-master kernel is built.
        cluster_lab: Forwarded to ``create_cluster`` in clustered mode.
        mode: ``'dense'``, ``'sparse'``, ``'clustered'`` or ``None``. ``None``
            is inferred from ``sijs``; with data only it defaults to
            ``'sparse'``, or ``'dense'`` when ``data_master`` is given.
        metric: ``'euclidean'`` or ``'cosine'``.
        num_neigh: Neighbour count for the kernel; ``-1`` means all data points.
        num_cluster: Forwarded to ``create_cluster`` in clustered mode.
        partial: Whether the function is restricted to ``ground_sub``.
        ground_sub: Ground subset; required when ``partial`` is true.

    Raises:
        Exception: On any inconsistent or missing argument.
    """
    self.n = n
    self.n_master = n_master
    self.mode = mode
    self.metric = metric
    self.sijs = sijs
    self.data = data
    self.data_master = data_master
    self.num_neigh = num_neigh
    self.partial = partial
    self.ground_sub = ground_sub
    self.seperateMaster = False
    self.clusters = None
    self.cluster_sijs = None
    self.cluster_map = None
    self.cluster_lab = cluster_lab
    self.num_cluster = num_cluster
    self.cpp_obj = None
    self.cpp_sijs = None
    self.cpp_ground_sub = ground_sub
    self.cpp_content = None

    def _nested_list(kernel):
        # pybind11 needs a list of lists; a 1-D array (1x1 similarity matrix)
        # turns into a flat list of scalars, so wrap it once more.
        as_list = kernel.tolist()
        if type(as_list[0]) in (int, float):
            as_list = [as_list]
        return as_list

    if self.n <= 0:
        raise Exception(
            "ERROR: Number of elements in ground set must be positive")
    # `is None` avoids the element-wise comparison an array-like subset would do.
    if self.partial == True and self.ground_sub is None:
        raise Exception("ERROR: Ground subset not specified")
    if mode is not None and mode not in ['dense', 'sparse', 'clustered']:
        raise Exception("ERROR: Incorrect mode")
    if metric not in ['euclidean', 'cosine']:
        raise Exception("ERROR: Unsupported metric")

    if self.sijs is not None:
        # User has provided the similarity matrix directly: simply consume it.
        is_dense = isinstance(self.sijs, np.ndarray)
        # sparse.csr_matrix is the public name of the class formerly reached
        # through the deprecated scipy.sparse.csr namespace.
        is_sparse = isinstance(self.sijs, sparse.csr_matrix)
        if not (is_dense or is_sparse):
            raise Exception(
                "ERROR: Similarity matrix must be a numpy ndarray or a scipy csr_matrix"
            )
        if np.shape(self.sijs)[0] != self.n:
            raise Exception("ERROR: Inconsistentcy between n and no of examples in the given similarity matrix")
        if is_sparse and num_neigh == -1:
            raise Exception("ERROR: num_neigh for given sparse matrix not provided")
        # The matrix type decides the mode; warn when an explicit mode disagrees.
        matrix_mode = "dense" if is_dense else "sparse"
        if self.mode is not None and self.mode != matrix_mode:
            print("WARNING: Incorrect mode provided for given similarity matrix, changing it to " + matrix_mode)
        self.mode = matrix_mode
    elif self.data is not None:
        # User has only provided data: build similarity matrix / cluster info.
        if np.shape(self.data)[0] != self.n:
            raise Exception("ERROR: Inconsistentcy between n and no of examples in the given data matrix")
        if self.data_master is not None:
            self.seperateMaster = True
            if np.shape(self.data_master)[0] != self.n_master:
                raise Exception("ERROR: Inconsistentcy between n_master and no of examples in the given data_master matrix")
            # The valid mode name is "clustered"; the old "cluster" comparison
            # could never match.
            if self.mode in ("sparse", "clustered"):
                raise Exception("ERROR: mode can't be sparse or cluster if ground and master datasets are different")
            if partial == True:
                raise Exception("ERROR: partial can't be True if ground and master datasets are different")
            # The ground-vs-master kernel is always a dense ndarray.
            self.mode = "dense"
        if self.mode is None:
            self.mode = "sparse"
        if self.num_neigh == -1 and not self.seperateMaster:
            self.num_neigh = np.shape(self.data)[0]  # default is total no of datapoints

        if self.mode == "clustered":
            self.clusters, self.cluster_sijs, self.cluster_map = create_cluster(
                self.data.tolist(), self.metric, self.cluster_lab,
                self.num_cluster)
        elif self.seperateMaster:
            self.sijs = np.array(
                subcp.create_kernel_NS(self.data.tolist(),
                                       self.data_master.tolist(),
                                       self.metric))
        else:
            self.cpp_content = np.array(
                subcp.create_kernel(self.data.tolist(), self.metric,
                                    self.num_neigh))
            val = self.cpp_content[0]
            row = [int(r) for r in self.cpp_content[1]]
            col = [int(c) for c in self.cpp_content[2]]
            if self.mode == "dense":
                self.sijs = np.zeros((n, n))
                self.sijs[row, col] = val
            else:
                self.sijs = sparse.csr_matrix((val, (row, col)), [n, n])
    else:
        raise Exception("ERROR: Neither data nor similarity matrix provided")

    if not self.partial:
        self.cpp_ground_sub = {-1}  # Dummy set so that the pybind11 binding succeeds

    # Break the similarity matrix into native data structures for pybind11.
    if self.mode == "dense":
        self.cpp_sijs = _nested_list(self.sijs)
        # TODO: relocate this check to some earlier part of code
        if np.shape(self.cpp_sijs)[0] != np.shape(self.cpp_sijs)[1] and not self.seperateMaster:
            raise Exception("ERROR: Dense similarity matrix should be a square matrix if ground and master datasets are same")
        self.cpp_obj = FacilityLocation(self.n, self.mode, self.cpp_sijs,
                                        self.num_neigh, self.partial,
                                        self.cpp_ground_sub,
                                        self.seperateMaster)
    elif self.mode == "sparse":
        # Break the CSR matrix into its three native component lists.
        self.cpp_sijs = {
            # non-zero values in the matrix (row major traversal)
            'arr_val': self.sijs.data.tolist(),
            # cumulative count of non-zero elements up to, but not including, the current row
            'arr_count': self.sijs.indptr.tolist(),
            # column index corresponding to each non-zero value in arr_val
            'arr_col': self.sijs.indices.tolist(),
        }
        self.cpp_obj = FacilityLocation(self.n, self.mode,
                                        self.cpp_sijs['arr_val'],
                                        self.cpp_sijs['arr_count'],
                                        self.cpp_sijs['arr_col'],
                                        self.num_neigh, self.partial,
                                        self.cpp_ground_sub)
    else:  # clustered
        self.cluster_sijs = [_nested_list(el) for el in self.cluster_sijs]
        self.cpp_obj = FacilityLocation(self.n, self.mode, self.clusters,
                                        self.cluster_sijs, self.cluster_map,
                                        self.num_neigh, self.partial,
                                        self.cpp_ground_sub)

    self.cpp_ground_sub = self.cpp_obj.getEffectiveGroundSet()
    self.ground_sub = self.cpp_ground_sub
# Ad-hoc timing script: builds a cosine kernel for a tiny 4x3 data matrix with
# sklearn and with the submodlib C++ helper, printing the elapsed time of each,
# then times the sklearn euclidean-based similarity.
import submodlib_cpp as subcp
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from time import time

data = np.array([[0, 1, 3], [5, 1, 5], [10, 2, 6], [12, 20, 68]])
num_neigh = 2

# sklearn reference: full cosine similarity matrix.
t = time()
CS = cosine_similarity(data)
print("sklearn:", time() - t)
print(CS.tolist())

# C++ helper: the result is used as (values, rows, cols) triplets, which are
# scattered into a dense 4x4 matrix (4 == number of rows in `data`).
t = time()
c = subcp.create_kernel(data.tolist(), 'cosine', num_neigh)
val = c[0]
row = list(map(lambda arg: int(arg), c[1]))
col = list(map(lambda arg: int(arg), c[2]))
s = np.zeros((4, 4))
s[row, col] = val
print("cpp:", time() - t)  # Author's note: at least 100 times faster (not verified here)
print(s)
# Equality check left disabled by the author; NOTE(review): with num_neigh=2
# the C++ kernel is presumably not the full matrix, so it would differ from CS.
#print(np.allclose(CS,s))

# sklearn reference: similarity exp(-distance * gamma), gamma = 1 / n_features.
t = time()
ED = euclidean_distances(data)
gamma = 1 / np.shape(data)[1]
ES = np.exp(-ED * gamma)  # sklearn ground truth
print("sklearn:", time() - t)
def __init__(self,
             n,
             f_name,
             data,
             mode,
             cluster_lab=None,
             num_clusters=None,
             metric="cosine",
             lambdaVal=1):
    """Validate the arguments, build cluster information and create the C++ Clustered object.

    Args:
        n: Number of elements in the ground set; must be positive.
        f_name: Forwarded unchanged to the C++ ``Clustered`` constructor.
        data: Ground data matrix with ``n`` rows (needs ``.tolist()``).
        mode: ``'single'`` (one dense kernel over all data) or ``'multi'``
            (one kernel per cluster).
        cluster_lab: Optional per-element cluster labels in
            ``[0, num_clusters - 1]``.
        num_clusters: Number of clusters; required and positive when
            ``cluster_lab`` is given.
        metric: Metric name forwarded to the kernel builders.
        lambdaVal: Forwarded to the C++ ``Clustered`` constructor.

    Raises:
        Exception: On any inconsistent argument.
    """
    self.n = n
    self.f_name = f_name
    self.num_clusters = num_clusters
    self.data = data
    self.mode = mode
    self.cluster_lab = cluster_lab
    self.metric = metric
    self.clusters = None
    self.cluster_sijs = None
    self.cluster_map = None
    self.sijs = None
    self.cpp_content = None
    self.cpp_sijs = None
    self.effective_ground = None
    self.lambdaVal = lambdaVal

    if self.n <= 0:
        raise Exception(
            "ERROR: Number of elements in ground set must be positive")
    if self.mode not in ['single', 'multi']:
        raise Exception(
            "ERROR: Incorrect mode. Must be one of 'single' or 'multi'")

    # All label checks only apply when labels were actually supplied.
    if self.cluster_lab is not None:
        if self.num_clusters is None or self.num_clusters <= 0:
            raise Exception(
                "ERROR: Positive number of clusters must be provided when cluster_lab is provided"
            )
        if len(self.cluster_lab) != self.n:
            raise Exception(
                "ERROR: cluster_lab's size is NOT same as ground set size")
        highest_label = self.num_clusters - 1
        if not all(0 <= label <= highest_label
                   for label in self.cluster_lab):
            raise Exception("Cluster IDs/labels contain invalid values")

    if np.shape(self.data)[0] != self.n:
        raise Exception(
            "ERROR: Inconsistentcy between n and no of examples in the given ground data matrix"
        )

    if mode == "single":
        # Only the cluster membership is needed; the kernel spans all data.
        self.clusters, _, _ = create_cluster_kernels(self.data.tolist(),
                                                     self.metric,
                                                     self.cluster_lab,
                                                     self.num_clusters,
                                                     onlyClusters=True)
        # Using all data as num_neighbors in case of dense mode
        self.num_neighbors = np.shape(self.data)[0]
        self.cpp_content = np.array(
            subcp.create_kernel(self.data.tolist(), self.metric,
                                self.num_neighbors))
        row_ids = [int(r) for r in self.cpp_content[1]]
        col_ids = [int(c) for c in self.cpp_content[2]]
        self.sijs = np.zeros((n, n))
        self.sijs[row_ids, col_ids] = self.cpp_content[0]

        # pybind11 needs a list of lists; a 1-D array (1x1 similarity matrix)
        # turns into a flat list of scalars, so wrap it once more.
        self.cpp_sijs = self.sijs.tolist()
        if type(self.cpp_sijs[0]) in (int, float):
            self.cpp_sijs = [self.cpp_sijs]

        self.cpp_obj = Clustered(self.n, self.f_name, self.clusters,
                                 self.cpp_sijs, self.lambdaVal)
    else:
        self.clusters, self.cluster_sijs, self.cluster_map = create_cluster_kernels(
            self.data.tolist(), self.metric, self.cluster_lab,
            self.num_clusters)

        # Convert every per-cluster kernel to a list of lists for pybind11.
        native_kernels = []
        for cluster_kernel in self.cluster_sijs:
            as_list = cluster_kernel.tolist()
            if type(as_list[0]) in (int, float):
                as_list = [as_list]
            native_kernels.append(as_list)
        self.cluster_sijs = native_kernels

        self.cpp_obj = Clustered(self.n, self.f_name, self.clusters,
                                 self.cluster_sijs, self.cluster_map,
                                 lambdaVal)

    self.effective_ground = self.cpp_obj.getEffectiveGroundSet()
def __init__(self,
             n,
             mode,
             sijs=None,
             data=None,
             metric="cosine",
             num_neighbors=None):
    """Validate the arguments, build the kernel if needed and create the C++ DisparityMin object.

    Args:
        n: Number of elements in the ground set; must be positive.
        mode: Kernel representation, ``'dense'`` or ``'sparse'``.
        sijs: Optional ``n x n`` similarity kernel (ndarray for dense mode,
            CSR matrix for sparse mode).
        data: Ground data matrix with ``n`` rows, used when ``sijs`` is absent.
        metric: Metric name forwarded to ``subcp.create_kernel``.
        num_neighbors: Required and positive in sparse mode; must be left
            ``None`` when a dense kernel is computed from ``data``.

    Raises:
        Exception: On any inconsistent or missing argument.
    """
    self.n = n
    self.mode = mode
    self.metric = metric
    self.sijs = sijs
    self.data = data
    self.num_neighbors = num_neighbors
    self.cpp_obj = None
    self.cpp_sijs = None
    self.cpp_content = None
    self.effective_ground = None

    if self.n <= 0:
        raise Exception(
            "ERROR: Number of elements in ground set must be positive")
    if self.mode not in ['dense', 'sparse']:
        raise Exception(
            "ERROR: Incorrect mode. Must be one of 'dense' or 'sparse'")

    if self.sijs is not None:
        # User has provided the similarity kernel.
        # sparse.csr_matrix is the public name of the class formerly reached
        # through the deprecated scipy.sparse.csr namespace.
        if isinstance(self.sijs, sparse.csr_matrix):
            if num_neighbors is None or num_neighbors <= 0:
                raise Exception(
                    "ERROR: Positive num_neighbors must be provided for given sparse kernel"
                )
            if mode != "sparse":
                raise Exception(
                    "ERROR: Sparse kernel provided, but mode is not sparse")
        elif isinstance(self.sijs, np.ndarray):
            if mode != "dense":
                raise Exception(
                    "ERROR: Dense kernel provided, but mode is not dense")
        else:
            raise Exception("Invalid kernel provided")
        # np.shape falls back on the object's own .shape attribute, so this
        # check covers both ndarray and CSR kernels.
        if np.shape(self.sijs)[0] != self.n or np.shape(
                self.sijs)[1] != self.n:
            raise Exception(
                "ERROR: Inconsistentcy between n and dimensionality of given similarity kernel"
            )
        if self.data is not None:
            print(
                "WARNING: similarity kernel found. Provided data matrix will be ignored."
            )
    elif self.data is not None:
        # Similarity kernel has not been provided: compute it from the data.
        if np.shape(self.data)[0] != self.n:
            raise Exception(
                "ERROR: Inconsistentcy between n and no of examples in the given data matrix"
            )
        if self.mode == "dense":
            if self.num_neighbors is not None:
                raise Exception(
                    "num_neighbors wrongly provided for dense mode")
            # Using all data as num_neighbors in case of dense mode
            self.num_neighbors = np.shape(self.data)[0]
        elif self.num_neighbors is None or self.num_neighbors <= 0:
            # Without this guard None would be passed into the C++ binding.
            raise Exception("Valid num_neighbors is needed for sparse mode")
        self.cpp_content = np.array(
            subcp.create_kernel(self.data.tolist(), self.metric,
                                self.num_neighbors))
        val = self.cpp_content[0]
        row = list(self.cpp_content[1].astype(int))
        col = list(self.cpp_content[2].astype(int))
        if self.mode == "dense":
            self.sijs = np.zeros((n, n))
            self.sijs[row, col] = val
        else:
            self.sijs = sparse.csr_matrix((val, (row, col)), [n, n])
    else:
        raise Exception(
            "ERROR: Neither ground set data matrix nor similarity kernel provided"
        )

    cpp_ground_sub = {-1}  # Dummy set so that the pybind11 binding succeeds

    # Break the similarity matrix into native data structures for pybind11.
    if self.mode == "dense":
        self.cpp_sijs = self.sijs.tolist()
        # pybind11 needs a list of lists; a 1-D array (1x1 similarity matrix)
        # turns into a flat list of scalars, so wrap it once more.
        if type(self.cpp_sijs[0]) in (int, float):
            self.cpp_sijs = [self.cpp_sijs]
        self.cpp_obj = DisparityMin(self.n, self.cpp_sijs, False,
                                    cpp_ground_sub)
    else:
        # Break the CSR matrix into its three native component lists.
        self.cpp_sijs = {
            # non-zero values in the matrix (row major traversal)
            'arr_val': self.sijs.data.tolist(),
            # cumulative count of non-zero elements up to, but not including, the current row
            'arr_count': self.sijs.indptr.tolist(),
            # column index corresponding to each non-zero value in arr_val
            'arr_col': self.sijs.indices.tolist(),
        }
        self.cpp_obj = DisparityMin(self.n, self.cpp_sijs['arr_val'],
                                    self.cpp_sijs['arr_count'],
                                    self.cpp_sijs['arr_col'])

    self.effective_ground = self.cpp_obj.getEffectiveGroundSet()
def __init__(self, n, mode, separate_rep=None, n_rep=None, sijs=None,
             data=None, data_rep=None, num_clusters=None, cluster_labels=None,
             metric="cosine", num_neighbors=None,
             create_dense_cpp_kernel_in_python=True, pybind_mode="array"):
    """Validate the arguments, build the similarity kernel and create the C++ object.

    Args:
        n: Number of elements in the ground set; must be positive.
        mode: One of ``'dense'``, ``'sparse'`` or ``'clustered'``.
        separate_rep: Whether a represented set distinct from the ground set
            is used. Requires ``n_rep`` and ``mode == 'dense'``. Must be given
            explicitly when a custom dense kernel is supplied; otherwise
            ``None`` is stored as ``False``.
        n_rep: Number of elements in the represented set (needed when
            ``separate_rep`` is true).
        sijs: Optional precomputed similarity kernel: a ``numpy.ndarray`` for
            dense mode or a ``scipy.sparse.csr_matrix`` for sparse mode. Its
            shape must be ``(n_rep, n)`` when ``separate_rep`` is true and
            ``(n, n)`` otherwise.
        data: Ground-set data matrix with ``n`` rows; used to compute the
            kernel when ``sijs`` is not given.
        data_rep: Represented-set data matrix with ``n_rep`` rows; only used
            when ``separate_rep`` is true.
        num_clusters: Number of clusters for clustered mode. Must be positive
            whenever ``cluster_labels`` is given.
        cluster_labels: Optional per-element cluster ids in
            ``[0, num_clusters - 1]``, one per ground-set element.
        metric: Similarity metric name forwarded to the kernel builders; it
            is not validated here.
        num_neighbors: Must be positive when a sparse kernel is supplied and
            must be ``None`` when a dense kernel is computed from ``data``.
            In sparse mode without a kernel it is forwarded to
            ``subcp.create_kernel``.
        create_dense_cpp_kernel_in_python: When true, a dense kernel is
            materialised on the Python side and handed to the C++ object;
            when false (dense mode only, no ``sijs``), the raw data is handed
            to ``FacilityLocation`` instead.
        pybind_mode: How a dense kernel is passed across the binding: one of
            ``'list'``, ``'numpyarray'``, ``'array32'``, ``'array64'`` or
            ``'array'``.

    Raises:
        Exception: On any inconsistent or missing argument (see messages).
    """
    self.n = n
    self.n_rep = n_rep
    self.mode = mode
    self.metric = metric
    self.sijs = sijs
    self.data = data
    self.data_rep = data_rep
    self.num_neighbors = num_neighbors
    self.separate_rep = separate_rep
    self.clusters = None
    self.cluster_sijs = None
    self.cluster_map = None
    self.cluster_labels = cluster_labels
    self.num_clusters = num_clusters
    self.cpp_obj = None
    self.cpp_sijs = None
    self.cpp_ground_sub = None
    self.cpp_content = None
    self.effective_ground = None

    # ---- Basic argument validation -------------------------------------
    if self.n <= 0:
        raise Exception(
            "ERROR: Number of elements in ground set must be positive")

    if self.mode not in ['dense', 'sparse', 'clustered']:
        raise Exception(
            "ERROR: Incorrect mode. Must be one of 'dense', 'sparse' or 'clustered'"
        )

    # NOTE: `metric` is intentionally not checked here; it is passed through
    # unchanged to the kernel-building routines.

    if self.separate_rep:
        if self.n_rep is None or self.n_rep <= 0:
            raise Exception(
                "ERROR: separate represented intended but number of elements in represented not specified or not positive"
            )
        if self.mode != "dense":
            raise Exception(
                "Only dense mode supported if separate_rep = True")

    if self.mode == "clustered":
        if self.cluster_labels is not None:
            if self.num_clusters is None or self.num_clusters <= 0:
                raise Exception(
                    "ERROR: Positive number of clusters must be provided in clustered mode when cluster_labels is provided"
                )
            if len(self.cluster_labels) != self.n:
                raise Exception(
                    "ERROR: cluster_labels's size is NOT same as ground set size"
                )
            if not all(0 <= ele <= self.num_clusters - 1
                       for ele in self.cluster_labels):
                raise Exception("Cluster IDs/labels contain invalid values")
        elif self.num_clusters is not None and self.num_clusters <= 0:
            raise Exception("Invalid number of clusters provided")

    # ---- Kernel: either validate the one supplied or build one from data
    if self.sijs is not None:
        # User has provided a similarity kernel.
        if not create_dense_cpp_kernel_in_python:
            raise Exception(
                "ERROR: create_dense_cpp_kernel_in_python is to be set to False ONLY when a similarity kernel is not provided and a CPP kernel is desired to be created in CPP"
            )
        # Use the public `sparse.csr_matrix` name rather than the deprecated
        # private `scipy.sparse.csr` namespace.
        if isinstance(self.sijs, sparse.csr_matrix):
            if num_neighbors is None or num_neighbors <= 0:
                raise Exception(
                    "ERROR: Positive num_neighbors must be provided for given sparse kernel"
                )
            if mode != "sparse":
                raise Exception(
                    "ERROR: Sparse kernel provided, but mode is not sparse"
                )
        elif isinstance(self.sijs, np.ndarray):
            if self.separate_rep is None:
                raise Exception(
                    "ERROR: separate_rep bool must be specified with custom dense kernel"
                )
            if mode != "dense":
                raise Exception(
                    "ERROR: Dense kernel provided, but mode is not dense")
        else:
            raise Exception("Invalid kernel provided")

        # Both accepted kernel types expose a `.shape` attribute, so one
        # dimensionality check serves dense and sparse kernels alike.
        kernel_shape = self.sijs.shape
        if self.separate_rep:
            if kernel_shape[1] != self.n or kernel_shape[0] != self.n_rep:
                raise Exception(
                    "ERROR: Inconsistency between n_rep, n and no of rows, columns of given kernel"
                )
        elif kernel_shape[0] != self.n or kernel_shape[1] != self.n:
            raise Exception(
                "ERROR: Inconsistentcy between n and dimensionality of given similarity kernel"
            )

        if self.data is not None or self.data_rep is not None:
            print(
                "WARNING: similarity kernel found. Provided data matrix will be ignored."
            )
    else:
        # Similarity kernel has not been provided: it must come from data.
        if self.data is None:
            raise Exception(
                "ERROR: Neither ground set data matrix nor similarity kernel provided"
            )

        if self.separate_rep:
            if self.data_rep is None:
                raise Exception("Represented data matrix not given")
            if np.shape(self.data)[0] != self.n or np.shape(
                    self.data_rep)[0] != self.n_rep:
                raise Exception(
                    "ERROR: Inconsistentcy between n, n_rep and no of examples in the given ground data matrix and represented data matrix"
                )
        else:
            if self.data_rep is not None:
                print(
                    "WARNING: Represented data matrix not required but given, will be ignored."
                )
            if np.shape(self.data)[0] != self.n:
                raise Exception(
                    "ERROR: Inconsistentcy between n and no of examples in the given data matrix"
                )

        if self.mode == "clustered":
            # Creates the clusters too when cluster_labels is not provided.
            self.clusters, self.cluster_sijs, self.cluster_map = create_cluster_kernels(
                self.data.tolist(), self.metric, self.cluster_labels,
                self.num_clusters)
        elif self.separate_rep:
            # Mode is always dense here (enforced above).
            if create_dense_cpp_kernel_in_python:
                self.sijs = np.array(
                    subcp.create_kernel_NS(self.data.tolist(),
                                           self.data_rep.tolist(),
                                           self.metric))
        elif self.mode == "dense":
            if self.num_neighbors is not None:
                raise Exception(
                    "num_neighbors wrongly provided for dense mode")
            if create_dense_cpp_kernel_in_python:
                self.sijs = np.array(
                    subcp.create_square_kernel_dense(self.data.tolist(),
                                                     self.metric))
        else:
            # Sparse mode: create_kernel yields (values, rows, cols).
            self.cpp_content = np.array(
                subcp.create_kernel(self.data.tolist(), self.metric,
                                    self.num_neighbors))
            val = self.cpp_content[0]
            # Index arrays are handed to csr_matrix directly; wrapping them
            # in Python lists first only costs time.
            row = self.cpp_content[1].astype(int)
            col = self.cpp_content[2].astype(int)
            self.sijs = sparse.csr_matrix((val, (row, col)), (n, n))

    if self.separate_rep is None:
        self.separate_rep = False

    # Dummy set so that the pybind11 binding succeeds.
    self.cpp_ground_sub = {-1}

    # ---- Hand the kernel (or the data) to the C++ implementation --------
    if self.mode == "dense":
        if create_dense_cpp_kernel_in_python:
            if pybind_mode == "list":
                # pybind11 must receive a list of lists; wrap a flat list
                # (1-D array, e.g. for a 1x1 similarity matrix).
                dense_rows = self.sijs.tolist()
                if isinstance(dense_rows[0], (int, float)):
                    dense_rows = [dense_rows]
                self.cpp_sijs = dense_rows
                self.cpp_obj = FacilityLocation(self.n, self.cpp_sijs, False,
                                                self.cpp_ground_sub,
                                                self.separate_rep)
            elif pybind_mode == "numpyarray":
                self.cpp_obj = FacilityLocation(self.n, self.sijs, False,
                                                self.cpp_ground_sub,
                                                self.separate_rep)
            elif pybind_mode in ("array32", "array64", "array"):
                if pybind_mode == "array32":
                    # astype returns the converted array; keep the result
                    # (it was previously discarded, making this a no-op).
                    self.sijs = self.sijs.astype('float32', copy=False)
                elif pybind_mode == "array64":
                    self.sijs = self.sijs.astype('float64', copy=False)
                self.cpp_obj = FacilityLocation2()
                self.cpp_obj.pybind_init(self.n, self.sijs, False,
                                         self.cpp_ground_sub,
                                         self.separate_rep)
            else:
                raise Exception("Invalid pybind mode!")
        elif self.separate_rep:
            self.cpp_obj = FacilityLocation(self.n, self.data.tolist(),
                                            self.data_rep.tolist(), True,
                                            self.metric)
        else:
            self.cpp_obj = FacilityLocation(self.n, self.data.tolist(),
                                            [[0.]], False, self.metric)
    elif self.mode == "sparse":
        # Break the scipy CSR matrix into its native component lists.
        self.cpp_sijs = {
            # non-zero values in the matrix (row-major traversal)
            'arr_val': self.sijs.data.tolist(),
            # cumulative count of non-zero elements up to, but not
            # including, the current row
            'arr_count': self.sijs.indptr.tolist(),
            # column index corresponding to each value in arr_val
            'arr_col': self.sijs.indices.tolist(),
        }
        self.cpp_obj = FacilityLocation(self.n, self.cpp_sijs['arr_val'],
                                        self.cpp_sijs['arr_count'],
                                        self.cpp_sijs['arr_col'])
    elif self.mode == "clustered":
        cluster_kernels = []
        for kernel in self.cluster_sijs:
            rows = kernel.tolist()
            # Same list-of-lists requirement as for the dense kernel.
            if isinstance(rows[0], (int, float)):
                rows = [rows]
            cluster_kernels.append(rows)
        self.cluster_sijs = cluster_kernels
        self.cpp_obj = FacilityLocation(self.n, self.clusters,
                                        self.cluster_sijs, self.cluster_map)

    self.effective_ground = self.cpp_obj.getEffectiveGroundSet()