Beispiel #1
0
def cpp_sparse_kernel():
    """Build a sparse euclidean similarity kernel with the C++ backend.

    Reads the module-level ``dataArray``, ``num_neighbors`` and
    ``num_samples``. ``subcp.create_kernel`` returns three sequences
    (values, row indices, column indices) which are assembled into a
    ``num_samples`` x ``num_samples`` CSR matrix.
    """
    kernel = np.array(subcp.create_kernel(dataArray.tolist(), "euclidean", num_neighbors))
    values = kernel[0]
    # Indices come back as non-integer entries and must be cast before use.
    row_idx = [int(r) for r in kernel[1]]
    col_idx = [int(c) for c in kernel[2]]
    return sparse.csr_matrix((values, (row_idx, col_idx)), [num_samples, num_samples])
Beispiel #2
0
def cpp_dense_kernel():
    """Build a dense euclidean similarity kernel with the C++ backend.

    Every row of the module-level ``dataArray`` is used as a neighbour, and
    the (values, rows, cols) triplets returned by ``subcp.create_kernel``
    are scattered into a ``num_samples`` x ``num_samples`` ndarray.
    """
    kernel = np.array(subcp.create_kernel(dataArray.tolist(), "euclidean", np.shape(dataArray)[0]))
    row_idx = [int(r) for r in kernel[1]]
    col_idx = [int(c) for c in kernel[2]]
    dense = np.zeros((num_samples, num_samples))
    dense[row_idx, col_idx] = kernel[0]
    return dense
Beispiel #3
0
 def test_cosine_neigh3(self, data, val):
     """Compare the 3-neighbour cosine kernel from the C++ backend with ``val``.

     ``val`` is the expected dense matrix (presumably supplied by a fixture
     or parametrisation -- confirm against the test setup).
     """
     kernel = subcp.create_kernel(data.tolist(), 'cosine', 3)
     rows = [int(r) for r in kernel[1]]
     cols = [int(c) for c in kernel[2]]
     n_points = np.shape(data)[0]
     # Scatter the (value, row, col) triplets into a dense square matrix.
     dense = np.zeros((n_points, n_points))
     dense[rows, cols] = kernel[0]
     assert np.allclose(dense, val)
Beispiel #4
0
 def test_cosine_full(self, data):
     """Check the full cosine kernel from the C++ backend against scikit-learn."""
     expected = cosine_similarity(data)  # sklearn ground truth
     n_points = np.shape(data)[0]
     # Using every point as a neighbour yields the complete kernel.
     kernel = subcp.create_kernel(data.tolist(), 'cosine', n_points)
     rows = [int(r) for r in kernel[1]]
     cols = [int(c) for c in kernel[2]]
     dense = np.zeros((n_points, n_points))
     dense[rows, cols] = kernel[0]
     assert np.allclose(dense, expected)
Beispiel #5
0
 def test_euclidean_full(self, data):
     """Check the full euclidean kernel from the C++ backend against scikit-learn.

     The reference similarity is ``exp(-distance * gamma)`` with
     ``gamma = 1 / number_of_columns``.
     """
     distances = euclidean_distances(data)
     gamma = 1 / np.shape(data)[1]
     expected = np.exp(-distances * gamma)  # sklearn ground truth
     n_points = np.shape(data)[0]
     kernel = subcp.create_kernel(data.tolist(), 'euclidean', n_points)
     rows = [int(r) for r in kernel[1]]
     cols = [int(c) for c in kernel[2]]
     dense = np.zeros((n_points, n_points))
     dense[rows, cols] = kernel[0]
     assert np.allclose(dense, expected)
Beispiel #6
0
    def __init__(self,
                 n,
                 mode,
                 lambdaVal,
                 separate_rep=None,
                 n_rep=None,
                 mgsijs=None,
                 ggsijs=None,
                 data=None,
                 data_rep=None,
                 metric="cosine",
                 num_neighbors=None):
        """Validate the arguments, build missing kernels and create the C++ object.

        Args:
            n: Number of elements in the ground set; must be positive.
            mode: Kernel storage, either 'dense' or 'sparse'.
            lambdaVal: Forwarded unchanged to the C++ ``GraphCut`` constructor.
            separate_rep: When true, the represented set differs from the
                ground set (dense mode only). ``None`` is stored as ``False``
                once validation has finished.
            n_rep: Size of the represented set; required and positive when
                ``separate_rep`` is true.
            mgsijs: Optional precomputed kernel. With ``separate_rep`` it must
                be an ndarray of shape (n_rep, n); otherwise it is used as the
                ground kernel when ``ggsijs`` is not given.
            ggsijs: Optional precomputed n x n ground kernel (ndarray in dense
                mode, CSR matrix in sparse mode).
            data: Ground data matrix, needed when a kernel must be computed.
            data_rep: Represented data matrix, needed to compute ``mgsijs``.
            metric: Forwarded to the ``subcp`` kernel builders; it is not
                validated here.
            num_neighbors: Required and positive in sparse mode; must be
                ``None`` when a dense kernel is computed from ``data``.

        Raises:
            Exception: For any inconsistent combination of arguments.
        """
        self.n = n
        self.mode = mode
        self.lambdaVal = lambdaVal
        self.separate_rep = separate_rep
        self.n_rep = n_rep
        self.mgsijs = mgsijs
        self.ggsijs = ggsijs
        self.data = data
        self.data_rep = data_rep
        self.metric = metric
        self.num_neighbors = num_neighbors

        self.clusters = None
        self.cluster_sijs = None
        self.cluster_map = None

        self.cpp_obj = None
        self.cpp_ggsijs = None
        self.cpp_mgsijs = None
        # Dummy set so that the pybind11 binding succeeds.
        self.cpp_ground_sub = {-1}
        self.cpp_content = None
        self.effective_ground = None

        def _dense_ground_kernel():
            # Compute the dense n x n ground kernel from ``self.data``.
            if self.data is None:
                raise Exception("Data missing to compute ggsijs")
            if self.num_neighbors is not None:
                raise Exception(
                    "num_neighbors wrongly provided for dense mode")
            # Dense mode uses every data point as a neighbour.
            self.num_neighbors = np.shape(self.data)[0]
            self.cpp_content = np.array(
                subcp.create_kernel(self.data.tolist(), self.metric,
                                    self.num_neighbors))
            val = self.cpp_content[0]
            row = list(self.cpp_content[1].astype(int))
            col = list(self.cpp_content[2].astype(int))
            kernel = np.zeros((n, n))
            kernel[row, col] = val
            return kernel

        def _nested_list(matrix):
            # pybind11 must receive a list of lists. A 1-D array (1x1
            # similarity matrix) yields a flat list, so wrap it once more.
            nested = matrix.tolist()
            if type(nested[0]) in (int, float):
                nested = [nested]
            return nested

        if self.n <= 0:
            raise Exception(
                "ERROR: Number of elements in ground set must be positive")

        if self.mode not in ['dense', 'sparse']:
            raise Exception(
                "ERROR: Incorrect mode. Must be one of 'dense' or 'sparse'")

        if self.separate_rep == True:
            if self.n_rep is None or self.n_rep <= 0:
                raise Exception(
                    "ERROR: separate represented intended but number of elements in represented not specified or not positive"
                )
            if self.mode != "dense":
                raise Exception(
                    "Only dense mode supported if separate_rep = True")
            if self.mgsijs is not None and not isinstance(
                    self.mgsijs, np.ndarray):
                raise Exception("mgsijs provided, but is not dense")
            if self.ggsijs is not None and not isinstance(
                    self.ggsijs, np.ndarray):
                raise Exception("ggsijs provided, but is not dense")

        if mode == "dense":
            if self.separate_rep == True:
                if self.mgsijs is None:
                    # mgsijs not provided: compute it from the two data sets.
                    if self.data is None or self.data_rep is None:
                        raise Exception("Data missing to compute mgsijs")
                    if np.shape(self.data)[0] != self.n or np.shape(
                            self.data_rep)[0] != self.n_rep:
                        raise Exception(
                            "ERROR: Inconsistentcy between n, n_rep and no of examples in the given ground data matrix and represented data matrix"
                        )
                    self.mgsijs = np.array(
                        subcp.create_kernel_NS(self.data.tolist(),
                                               self.data_rep.tolist(),
                                               self.metric))
                elif np.shape(self.mgsijs)[1] != self.n or np.shape(
                        self.mgsijs)[0] != self.n_rep:
                    # mgsijs provided: it must be n_rep x n.
                    raise Exception(
                        "ERROR: Inconsistency between n_rep, n and no of rows, columns of given mg kernel"
                    )

                if self.ggsijs is None:
                    self.ggsijs = _dense_ground_kernel()
                elif np.shape(self.ggsijs)[0] != self.n or np.shape(
                        self.ggsijs)[1] != self.n:
                    # ggsijs provided: it must be n x n.
                    raise Exception(
                        "ERROR: Inconsistentcy between n and dimensionality of given similarity gg kernel"
                    )
            else:
                if self.ggsijs is None and self.mgsijs is None:
                    # No kernel provided: compute the ground kernel.
                    self.ggsijs = _dense_ground_kernel()
                elif self.ggsijs is None:
                    # Only mgsijs given: verify it is a dense n x n kernel and
                    # use it as the ground kernel.
                    if (not isinstance(self.mgsijs, np.ndarray)
                        ) or np.shape(self.mgsijs)[1] != self.n or np.shape(
                            self.mgsijs)[0] != self.n:
                        raise Exception(
                            "ERROR: Inconsistency between n and no of rows, columns of given kernel"
                        )
                    self.ggsijs = self.mgsijs
                elif self.mgsijs is None:
                    # Only ggsijs given: verify it is a dense n x n kernel.
                    if (not isinstance(self.ggsijs, np.ndarray)
                        ) or np.shape(self.ggsijs)[1] != self.n or np.shape(
                            self.ggsijs)[0] != self.n:
                        raise Exception(
                            "ERROR: Inconsistency between n and no of rows, columns of given kernel"
                        )
                else:
                    # Both kernels given: ambiguous without separate_rep.
                    raise Exception(
                        "Two kernels have been wrongly provided when separate_rep=False"
                    )
        elif mode == "sparse":
            if self.separate_rep == True:
                raise Exception(
                    "Separate represented is supported only in dense mode")
            if self.num_neighbors is None or self.num_neighbors <= 0:
                raise Exception(
                    "Valid num_neighbors is needed for sparse mode")
            if self.ggsijs is None and self.mgsijs is None:
                # No kernel provided: compute a sparse ground kernel.
                if self.data is None:
                    raise Exception("Data missing to compute ggsijs")
                self.cpp_content = np.array(
                    subcp.create_kernel(self.data.tolist(), self.metric,
                                        self.num_neighbors))
                val = self.cpp_content[0]
                row = list(self.cpp_content[1].astype(int))
                col = list(self.cpp_content[2].astype(int))
                self.ggsijs = sparse.csr_matrix((val, (row, col)), [n, n])
            elif self.ggsijs is None:
                # Only mgsijs given: verify it is sparse and use it as the
                # ground kernel.
                if not isinstance(self.mgsijs, sparse.csr_matrix):
                    raise Exception("Provided kernel is not sparse")
                self.ggsijs = self.mgsijs
            elif self.mgsijs is None:
                # Only ggsijs given: verify it is sparse.
                if not isinstance(self.ggsijs, sparse.csr_matrix):
                    raise Exception("Provided kernel is not sparse")
            else:
                # Both kernels given: ambiguous without separate_rep.
                raise Exception(
                    "Two kernels have been wrongly provided when separate_rep=False"
                )

        if self.separate_rep is None:
            self.separate_rep = False

        if self.mode == "dense" and self.separate_rep == False:
            self.cpp_ggsijs = _nested_list(self.ggsijs)
            self.cpp_obj = GraphCut(self.n, self.cpp_ggsijs, False,
                                    self.cpp_ground_sub, self.lambdaVal)

        elif self.mode == "dense" and self.separate_rep == True:
            self.cpp_ggsijs = _nested_list(self.ggsijs)
            self.cpp_mgsijs = _nested_list(self.mgsijs)
            self.cpp_obj = GraphCut(self.n, self.cpp_mgsijs, self.cpp_ggsijs,
                                    self.lambdaVal)

        elif self.mode == "sparse":
            # Break the CSR matrix into its three native component lists.
            self.cpp_ggsijs = {}
            # Non-zero values in row-major order.
            self.cpp_ggsijs['arr_val'] = self.ggsijs.data.tolist()
            # Cumulative count of non-zeros up to, not including, each row.
            self.cpp_ggsijs['arr_count'] = self.ggsijs.indptr.tolist()
            # Column index of each entry of arr_val.
            self.cpp_ggsijs['arr_col'] = self.ggsijs.indices.tolist()
            self.cpp_obj = GraphCut(self.n, self.cpp_ggsijs['arr_val'],
                                    self.cpp_ggsijs['arr_count'],
                                    self.cpp_ggsijs['arr_col'], lambdaVal)
        else:
            raise Exception("Invalid")

        self.effective_ground = self.cpp_obj.getEffectiveGroundSet()
Beispiel #7
0
    def __init__(self,
                 n,
                 num_privates,
                 lambdaVal,
                 data_sijs=None,
                 private_sijs=None,
                 private_private_sijs=None,
                 data=None,
                 privateData=None,
                 metric="cosine",
                 privacyHardness=1):
        """Validate the arguments, build missing kernels and create the C++ object.

        Args:
            n: Number of elements in the ground set; must be positive.
            num_privates: Number of private elements; must be >= 0.
            lambdaVal: Forwarded unchanged to the C++ constructor.
            data_sijs: Optional n x n ndarray kernel of the ground set.
            private_sijs: Optional n x num_privates ndarray kernel.
            private_private_sijs: Optional num_privates x num_privates
                ndarray kernel.
            data: Ground data matrix with n rows.
            privateData: Private data matrix with num_privates rows.
            metric: Forwarded to the ``subcp`` kernel builders; it is not
                validated here.
            privacyHardness: Forwarded unchanged to the C++ constructor.

        The three kernels are used only when all of them are supplied;
        otherwise all three are computed from ``data`` and ``privateData``.

        Raises:
            Exception: For any inconsistent combination of arguments.
        """
        self.n = n
        self.num_privates = num_privates
        self.lambdaVal = lambdaVal
        self.metric = metric
        self.privacyHardness = privacyHardness
        self.data_sijs = data_sijs
        self.private_sijs = private_sijs
        self.private_private_sijs = private_private_sijs
        self.data = data
        self.privateData = privateData

        self.cpp_obj = None
        self.cpp_data_sijs = None
        self.cpp_private_sijs = None
        self.cpp_private_private_sijs = None
        self.cpp_content = None
        self.cpp_content2 = None
        self.effective_ground = None

        def _scatter_dense(content, size):
            # Turn the (values, rows, cols) triplets into a size x size array.
            kernel = np.zeros((size, size))
            rows = list(content[1].astype(int))
            cols = list(content[2].astype(int))
            kernel[rows, cols] = content[0]
            return kernel

        def _nested_list(matrix):
            # pybind11 must receive a list of lists. A 1-D array (1x1
            # similarity matrix) yields a flat list, so wrap it once more.
            # The emptiness guard keeps a 0 x 0 kernel (num_privates == 0)
            # from raising IndexError on ``nested[0]``.
            nested = matrix.tolist()
            if nested and type(nested[0]) in (int, float):
                nested = [nested]
            return nested

        if self.n <= 0:
            raise Exception(
                "ERROR: Number of elements in ground set must be positive")

        if self.num_privates < 0:
            raise Exception("ERROR: Number of queries must be >= 0")

        kernels_given = (self.data_sijs is not None
                         and self.private_sijs is not None
                         and self.private_private_sijs is not None)

        if kernels_given:
            # The user supplied all three kernels: only validate them.
            if not isinstance(self.data_sijs, np.ndarray):
                raise Exception(
                    "Invalid data kernel type provided, must be ndarray")
            if not isinstance(self.private_sijs, np.ndarray):
                raise Exception(
                    "Invalid private kernel type provided, must be ndarray")
            if not isinstance(self.private_private_sijs, np.ndarray):
                raise Exception(
                    "Invalid private-private kernel type provided, must be ndarray"
                )
            if np.shape(self.data_sijs)[0] != self.n or np.shape(
                    self.data_sijs)[1] != self.n:
                raise Exception("ERROR: data kernel should be n X n")
            if np.shape(self.private_sijs)[0] != self.n or np.shape(
                    self.private_sijs)[1] != self.num_privates:
                raise Exception(
                    "ERROR: Private Kernel should be n X num_privates")
            if np.shape(self.private_private_sijs
                        )[0] != self.num_privates or np.shape(
                            self.private_private_sijs)[1] != self.num_privates:
                raise Exception(
                    "ERROR: Private-private Kernel should be num_privates X num_privates"
                )
            if self.data is not None or self.privateData is not None:
                print(
                    "WARNING: similarity kernels found. Provided data and private matrices will be ignored."
                )
        else:
            # At least one kernel is missing: compute all three from data.
            if self.data is None or self.privateData is None:
                raise Exception(
                    "Since kernels are not provided, data matrices are a must")
            if np.shape(self.data)[0] != self.n:
                raise Exception(
                    "ERROR: Inconsistentcy between n and no of examples in the given data matrix"
                )
            if np.shape(self.privateData)[0] != self.num_privates:
                raise Exception(
                    "ERROR: Inconsistentcy between num_privates and no of examples in the given private data matrix"
                )

            # Ground kernel: every data point is used as a neighbour.
            self.num_neighbors = self.n
            self.cpp_content = np.array(
                subcp.create_kernel(self.data.tolist(), self.metric,
                                    self.num_neighbors))
            self.data_sijs = _scatter_dense(self.cpp_content, self.n)

            # Kernel between the private set and the ground set.
            self.private_sijs = np.array(
                subcp.create_kernel_NS(self.privateData.tolist(),
                                       self.data.tolist(), self.metric))

            # Private-private kernel: every private point is a neighbour.
            self.num_neighbors2 = self.num_privates
            self.cpp_content2 = np.array(
                subcp.create_kernel(self.privateData.tolist(), self.metric,
                                    self.num_neighbors2))
            self.private_private_sijs = _scatter_dense(
                self.cpp_content2, self.num_privates)

        # Break the ndarrays into native lists for implicit pybind11 binding.
        self.cpp_data_sijs = _nested_list(self.data_sijs)
        self.cpp_private_sijs = _nested_list(self.private_sijs)
        self.cpp_private_private_sijs = _nested_list(
            self.private_private_sijs)

        self.cpp_obj = LogDeterminantConditionalGain(
            self.n, self.num_privates, self.cpp_data_sijs,
            self.cpp_private_sijs, self.cpp_private_private_sijs,
            self.lambdaVal, self.privacyHardness)
        self.effective_ground = set(range(n))
Beispiel #8
0
	def __init__(self, n, n_master=-1, sijs=None, data=None, data_master=None, cluster_lab=None, mode=None, metric="cosine", num_neigh=-1, num_cluster=None, partial=False, ground_sub=None):
		"""Validate the arguments, build the similarity structure and create the C++ object.

		Args:
			n: Number of elements in the ground set; must not be 0.
			n_master: Expected number of rows of ``data_master``.
			sijs: Optional precomputed similarity matrix (ndarray or CSR
				matrix). When given, ``data`` is not used.
			data: Ground data matrix with n rows, used when ``sijs`` is absent.
			data_master: Optional separate master data matrix; only the
				dense mode supports it.
			cluster_lab: Forwarded to ``create_cluster`` in clustered mode.
			mode: 'dense', 'sparse' or 'clustered'. When omitted it is
				inferred from ``sijs``, or defaults to 'sparse' ('dense' when
				``data_master`` is given).
			metric: 'euclidean' or 'cosine'.
			num_neigh: Number of neighbours; -1 means all data points when
				the kernel is computed without a separate master set.
			num_cluster: Forwarded to ``create_cluster`` in clustered mode.
			partial: Whether only ``ground_sub`` is used as ground set.
			ground_sub: Ground subset, required when ``partial`` is true.

		Raises:
			Exception: For any inconsistent combination of arguments.
		"""
		self.n = n
		self.n_master = n_master
		self.mode = mode
		self.metric = metric
		self.sijs = sijs
		self.data = data
		self.data_master=data_master
		self.num_neigh = num_neigh
		self.partial = partial
		self.ground_sub = ground_sub
		self.seperateMaster=False
		self.clusters=None
		self.cluster_sijs=None
		self.cluster_map=None
		self.cluster_lab=cluster_lab
		self.num_cluster=num_cluster
		self.cpp_obj = None
		self.cpp_sijs = None
		self.cpp_ground_sub = ground_sub
		self.cpp_content = None

		if self.n==0:
			raise Exception("ERROR: Number of elements in ground set can't be 0")

		if self.partial==True and self.ground_sub is None:
			raise Exception("ERROR: Ground subset not specified")

		if mode is not None and mode not in ['dense', 'sparse', 'clustered']:
			raise Exception("ERROR: Incorrect mode")

		if metric not in ['euclidean', 'cosine']:
			raise Exception("ERROR: Unsupported metric")

		if self.sijs is not None: # User has provided sim matrix directly: simply consume it
			if np.shape(self.sijs)[0]!=self.n:
				raise Exception("ERROR: Inconsistentcy between n and no of examples in the given similarity matrix")

			is_dense = isinstance(self.sijs, np.ndarray)
			is_sparse = isinstance(self.sijs, sparse.csr_matrix)

			if is_sparse and num_neigh==-1:
				raise Exception("ERROR: num_neigh for given sparse matrix not provided")
			if self.mode is not None: # Ensure that there is no inconsistency in similarity matrix and provided mode
				if is_dense and self.mode!="dense":
					print("WARNING: Incorrect mode provided for given similarity matrix, changing it to dense")
					self.mode="dense"
				if is_sparse and self.mode!="sparse":
					print("WARNING: Incorrect mode provided for given similarity matrix, changing it to sparse")
					self.mode="sparse"
			else: # Infer mode from similarity matrix
				if is_dense:
					self.mode="dense"
				if is_sparse:
					self.mode="sparse"
		elif self.data is None:
			raise Exception("ERROR: Neither data nor similarity matrix provided")
		else: # User has only provided data: build similarity matrix/cluster-info and consume it
			if np.shape(self.data)[0]!=self.n:
				raise Exception("ERROR: Inconsistentcy between n and no of examples in the given data matrix")

			if self.data_master is not None:
				self.seperateMaster=True
				if np.shape(self.data_master)[0]!=self.n_master:
					raise Exception("ERROR: Inconsistentcy between n_master and no of examples in the given data_master matrix")
				# The accepted spelling of the mode is "clustered"; comparing
				# against "cluster" would never match.
				if self.mode=="sparse" or self.mode=="clustered":
					raise Exception("ERROR: mode can't be sparse or cluster if ground and master datasets are different")
				if partial==True:
					raise Exception("ERROR: partial can't be True if ground and master datasets are different")

			if self.mode is None:
				# A separate master set always produces a dense ndarray
				# kernel, so "sparse" is only a valid default without one.
				self.mode = "dense" if self.seperateMaster else "sparse"

			if self.num_neigh==-1 and self.seperateMaster==False:
				self.num_neigh=np.shape(self.data)[0] #default is total no of datapoints

			if self.mode=="clustered":
				self.clusters, self.cluster_sijs, self.cluster_map = create_cluster(self.data.tolist(), self.metric, self.cluster_lab, self.num_cluster)
			elif self.seperateMaster==True: #mode in this case is always dense
				self.sijs = np.array(subcp.create_kernel_NS(self.data.tolist(),self.data_master.tolist(), self.metric))
			else:
				self.cpp_content = np.array(subcp.create_kernel(self.data.tolist(), self.metric, self.num_neigh))
				val = self.cpp_content[0]
				row = [int(arg) for arg in self.cpp_content[1]]
				col = [int(arg) for arg in self.cpp_content[2]]
				if self.mode=="dense":
					self.sijs = np.zeros((n,n))
					self.sijs[row,col] = val
				if self.mode=="sparse":
					self.sijs = sparse.csr_matrix((val, (row, col)), [n,n])

		if self.partial==False:
			self.cpp_ground_sub = {-1} #Provide a dummy set for pybind11 binding to be successful

		#Breaking similarity matrix to simpler native data structures for implicit pybind11 binding
		if self.mode=="dense":
			self.cpp_sijs = self.sijs.tolist() #break numpy ndarray to native list of list datastructure

			# It is critical that pybind11 receives a list of lists; a 1D
			# numpy array (1x1 sim matrix) yields a flat list, so wrap it.
			if type(self.cpp_sijs[0])==int or type(self.cpp_sijs[0])==float:
				self.cpp_sijs=[self.cpp_sijs]
			if np.shape(self.cpp_sijs)[0]!=np.shape(self.cpp_sijs)[1] and self.seperateMaster==False: #TODO: relocate this check to some earlier part of code
				raise Exception("ERROR: Dense similarity matrix should be a square matrix if ground and master datasets are same")

			self.cpp_obj = FacilityLocation(self.n, self.mode, self.cpp_sijs, self.num_neigh, self.partial, self.cpp_ground_sub, self.seperateMaster)

		if self.mode=="sparse": #break scipy sparse matrix to native component lists (for csr implementation)
			self.cpp_sijs = {}
			self.cpp_sijs['arr_val'] = self.sijs.data.tolist() #contains non-zero values in matrix (row major traversal)
			self.cpp_sijs['arr_count'] = self.sijs.indptr.tolist() #cumulative count of non-zero elements up to but not including current row
			self.cpp_sijs['arr_col'] = self.sijs.indices.tolist() #contains col index corresponding to non-zero values in arr_val
			self.cpp_obj = FacilityLocation(self.n, self.mode, self.cpp_sijs['arr_val'], self.cpp_sijs['arr_count'], self.cpp_sijs['arr_col'], self.num_neigh, self.partial, self.cpp_ground_sub)

		if self.mode=="clustered":
			# Convert every per-cluster kernel to a list of lists as well.
			l_temp = []
			for el in self.cluster_sijs:
				temp=el.tolist()
				if type(temp[0])==int or type(temp[0])==float:
					temp=[temp]
				l_temp.append(temp)
			self.cluster_sijs = l_temp

			self.cpp_obj = FacilityLocation(self.n, self.mode, self.clusters, self.cluster_sijs, self.cluster_map, self.num_neigh, self.partial, self.cpp_ground_sub)

		self.cpp_ground_sub=self.cpp_obj.getEffectiveGroundSet()
		self.ground_sub=self.cpp_ground_sub
Beispiel #9
0
import submodlib_cpp as subcp
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from time import time

# Small demo comparing the C++ kernel builder with scikit-learn.
data = np.array([[0, 1, 3], [5, 1, 5], [10, 2, 6], [12, 20, 68]])

num_neigh = 2

# Reference: scikit-learn cosine similarity, timed.
t = time()
CS = cosine_similarity(data)
print("sklearn:", time() - t)
print(CS.tolist())

# C++ backend: returns (values, row indices, column indices) which are
# scattered into a dense square matrix.
t = time()
c = subcp.create_kernel(data.tolist(), 'cosine', num_neigh)
val = c[0]
row = [int(arg) for arg in c[1]]
col = [int(arg) for arg in c[2]]
# Size the matrix from the data instead of hard-coding 4 x 4.
s = np.zeros((np.shape(data)[0], np.shape(data)[0]))
s[row, col] = val
print("cpp:", time() - t)  # original author's note: at least 100 times faster
print(s)

#print(np.allclose(CS,s))

# Reference euclidean similarity: exp(-distance / number_of_columns).
t = time()
ED = euclidean_distances(data)
gamma = 1 / np.shape(data)[1]
ES = np.exp(-ED * gamma)  #sklearn ground truth
print("sklearn:", time() - t)
Beispiel #10
0
    def __init__(self,
                 n,
                 f_name,
                 data,
                 mode,
                 cluster_lab=None,
                 num_clusters=None,
                 metric="cosine",
                 lambdaVal=1):
        """Validate the inputs and construct the wrapped C++ ``Clustered`` object.

        Args:
            n: Size of the ground set. Must be positive and equal to the
                number of rows of ``data``.
            f_name: Passed through unchanged to the ``Clustered`` constructor.
            data: Ground-set data matrix. It must expose ``tolist()`` and
                have ``n`` rows (presumably a numpy array).
            mode: ``'single'`` builds one dense ``n x n`` kernel over the
                whole ground set; ``'multi'`` uses the per-cluster kernels
                and cluster map returned by ``create_cluster_kernels``.
            cluster_lab: Optional cluster id for every ground-set element,
                each within ``[0, num_clusters - 1]``.
            num_clusters: Number of clusters. Required and positive whenever
                ``cluster_lab`` is given.
            metric: Metric name handed to the kernel builders.
            lambdaVal: Passed through to the ``Clustered`` constructor.

        Raises:
            Exception: If ``n`` is not positive, ``mode`` is unknown, the
                cluster labels are inconsistent with ``n``/``num_clusters``,
                or ``data`` does not have ``n`` rows.
        """
        self.n = n
        self.f_name = f_name
        self.num_clusters = num_clusters
        self.data = data
        self.mode = mode
        self.cluster_lab = cluster_lab
        self.metric = metric

        self.clusters = None
        self.cluster_sijs = None
        self.cluster_map = None
        self.sijs = None
        self.cpp_content = None
        self.cpp_sijs = None
        self.effective_ground = None
        self.lambdaVal = lambdaVal

        if self.n <= 0:
            raise Exception(
                "ERROR: Number of elements in ground set must be positive")

        if self.mode not in ['single', 'multi']:
            raise Exception(
                "ERROR: Incorrect mode. Must be one of 'single' or 'multi'")

        # Cluster labels are optional; when present they must be consistent
        # with both n and num_clusters.
        if self.cluster_lab is not None:
            if self.num_clusters is None or self.num_clusters <= 0:
                raise Exception(
                    "ERROR: Positive number of clusters must be provided when cluster_lab is provided"
                )
            if len(self.cluster_lab) != self.n:
                raise Exception(
                    "ERROR: cluster_lab's size is NOT same as ground set size")
            if not all(0 <= ele <= self.num_clusters - 1
                       for ele in self.cluster_lab):
                raise Exception("Cluster IDs/labels contain invalid values")

        if np.shape(self.data)[0] != self.n:
            raise Exception(
                "ERROR: Inconsistentcy between n and no of examples in the given ground data matrix"
            )

        # Convert once; both kernel builders take the data as nested lists.
        data_list = self.data.tolist()

        if self.mode == "single":
            self.clusters, _, _ = create_cluster_kernels(data_list,
                                                         self.metric,
                                                         self.cluster_lab,
                                                         self.num_clusters,
                                                         onlyClusters=True)
            # Dense kernel: every element is a neighbour of every other one.
            self.num_neighbors = np.shape(self.data)[0]
            self.cpp_content = np.array(
                subcp.create_kernel(data_list, self.metric,
                                    self.num_neighbors))
            # cpp_content holds (values, row indices, column indices).
            val = self.cpp_content[0]
            row = self.cpp_content[1].astype(int)
            col = self.cpp_content[2].astype(int)
            self.sijs = np.zeros((n, n))
            self.sijs[row, col] = val
            # pybind11 needs a native list of lists, not an ndarray.
            self.cpp_sijs = self.sijs.tolist()
            if isinstance(self.cpp_sijs[0], (int, float)):
                # Defensive: wrap a flat list so a list of lists is passed.
                self.cpp_sijs = [self.cpp_sijs]
            self.cpp_obj = Clustered(self.n, self.f_name, self.clusters,
                                     self.cpp_sijs, self.lambdaVal)
        else:
            (self.clusters, self.cluster_sijs,
             self.cluster_map) = create_cluster_kernels(
                 data_list, self.metric, self.cluster_lab, self.num_clusters)
            nested_kernels = []
            for kernel in self.cluster_sijs:
                rows = kernel.tolist()
                if isinstance(rows[0], (int, float)):
                    # A 1-D kernel converts to a flat list; pybind11 needs a
                    # list of lists, so wrap it.
                    rows = [rows]
                nested_kernels.append(rows)
            self.cluster_sijs = nested_kernels
            self.cpp_obj = Clustered(self.n, self.f_name, self.clusters,
                                     self.cluster_sijs, self.cluster_map,
                                     self.lambdaVal)
        self.effective_ground = self.cpp_obj.getEffectiveGroundSet()
Beispiel #11
0
    def __init__(self,
                 n,
                 mode,
                 sijs=None,
                 data=None,
                 metric="cosine",
                 num_neighbors=None):
        """Validate the inputs and construct the wrapped C++ ``DisparityMin`` object.

        Either a ready-made similarity kernel (``sijs``) or a data matrix
        (``data``) must be given. When both are given the kernel wins and a
        warning is printed.

        Args:
            n: Size of the ground set. Must be positive.
            mode: ``'dense'`` or ``'sparse'``.
            sijs: Optional similarity kernel of shape ``(n, n)``: a numpy
                array for dense mode or a scipy CSR matrix for sparse mode.
            data: Optional data matrix with ``n`` rows, used to build the
                kernel when ``sijs`` is not given. Must expose ``tolist()``.
            metric: Metric name handed to ``subcp.create_kernel``.
            num_neighbors: Positive neighbour count; required in sparse mode
                and must be left as ``None`` when a dense kernel is built
                from ``data``.

        Raises:
            Exception: If any of the consistency checks on the arguments
                fails, or if neither ``sijs`` nor ``data`` is provided.
        """
        self.n = n
        self.mode = mode
        self.metric = metric
        self.sijs = sijs
        self.data = data
        self.num_neighbors = num_neighbors
        self.cpp_obj = None
        self.cpp_sijs = None
        self.cpp_content = None
        self.effective_ground = None

        if self.n <= 0:
            raise Exception(
                "ERROR: Number of elements in ground set must be positive")

        if self.mode not in ['dense', 'sparse']:
            raise Exception(
                "ERROR: Incorrect mode. Must be one of 'dense' or 'sparse'")

        if self.sijs is not None:  # User has provided similarity kernel
            # sparse.csr_matrix is the public name; the scipy.sparse.csr
            # submodule path is a deprecated private namespace.
            if isinstance(self.sijs, sparse.csr_matrix):
                if num_neighbors is None or num_neighbors <= 0:
                    raise Exception(
                        "ERROR: Positive num_neighbors must be provided for given sparse kernel"
                    )
                if mode != "sparse":
                    raise Exception(
                        "ERROR: Sparse kernel provided, but mode is not sparse"
                    )
            elif isinstance(self.sijs, np.ndarray):
                if mode != "dense":
                    raise Exception(
                        "ERROR: Dense kernel provided, but mode is not dense")
            else:
                raise Exception("Invalid kernel provided")
            kernel_shape = np.shape(self.sijs)
            if kernel_shape[0] != self.n or kernel_shape[1] != self.n:
                raise Exception(
                    "ERROR: Inconsistentcy between n and dimensionality of given similarity kernel"
                )
            if self.data is not None:
                print(
                    "WARNING: similarity kernel found. Provided data matrix will be ignored."
                )
        elif self.data is not None:  # Build the kernel from the data matrix
            if np.shape(self.data)[0] != self.n:
                raise Exception(
                    "ERROR: Inconsistentcy between n and no of examples in the given data matrix"
                )

            if self.mode == "dense":
                if self.num_neighbors is not None:
                    raise Exception(
                        "num_neighbors wrongly provided for dense mode")
                # Dense mode: every element is a neighbour of every other.
                self.num_neighbors = np.shape(self.data)[0]
            elif self.num_neighbors is None or self.num_neighbors <= 0:
                # Fail early with a clear message instead of handing an
                # unusable neighbour count to the C++ kernel builder.
                raise Exception(
                    "ERROR: Positive num_neighbors must be provided for sparse mode"
                )

            self.cpp_content = np.array(
                subcp.create_kernel(self.data.tolist(), self.metric,
                                    self.num_neighbors))
            # cpp_content holds (values, row indices, column indices).
            val = self.cpp_content[0]
            row = self.cpp_content[1].astype(int)
            col = self.cpp_content[2].astype(int)
            if self.mode == "dense":
                self.sijs = np.zeros((n, n))
                self.sijs[row, col] = val
            else:
                self.sijs = sparse.csr_matrix((val, (row, col)),
                                              shape=(n, n))
        else:
            raise Exception(
                "ERROR: Neither ground set data matrix nor similarity kernel provided"
            )

        # Dummy set so that the pybind11 binding succeeds.
        cpp_ground_sub = {-1}

        # Break the kernel into native structures for implicit pybind11
        # conversion.
        if self.mode == "dense":
            self.cpp_sijs = self.sijs.tolist()
            if isinstance(self.cpp_sijs[0], (int, float)):
                # Defensive: wrap a flat list so a list of lists is passed.
                self.cpp_sijs = [self.cpp_sijs]

            self.cpp_obj = DisparityMin(self.n, self.cpp_sijs, False,
                                        cpp_ground_sub)
        else:
            # CSR components: non-zero values, row pointer array and the
            # column index of every non-zero value.
            self.cpp_sijs = {
                'arr_val': self.sijs.data.tolist(),
                'arr_count': self.sijs.indptr.tolist(),
                'arr_col': self.sijs.indices.tolist(),
            }
            self.cpp_obj = DisparityMin(self.n, self.cpp_sijs['arr_val'],
                                        self.cpp_sijs['arr_count'],
                                        self.cpp_sijs['arr_col'])

        self.effective_ground = self.cpp_obj.getEffectiveGroundSet()
Beispiel #12
0
    def __init__(self,
                 n,
                 mode,
                 separate_rep=None,
                 n_rep=None,
                 sijs=None,
                 data=None,
                 data_rep=None,
                 num_clusters=None,
                 cluster_labels=None,
                 metric="cosine",
                 num_neighbors=None,
                 create_dense_cpp_kernel_in_python=True,
                 pybind_mode="array"):
        """Validate the inputs and construct the wrapped C++ facility-location object.

        Either a ready-made similarity kernel (``sijs``) or a data matrix
        (``data``) must be given. When both are given the kernel wins and a
        warning is printed.

        Args:
            n: Size of the ground set. Must be positive.
            mode: ``'dense'``, ``'sparse'`` or ``'clustered'``.
            separate_rep: Whether the represented set differs from the ground
                set. Only supported in dense mode; must be specified when a
                dense kernel is provided. ``None`` is stored as ``False``.
            n_rep: Size of the represented set; required and positive when
                ``separate_rep`` is set.
            sijs: Optional similarity kernel: a numpy array (dense mode) or
                a scipy CSR matrix (sparse mode). Expected shape is
                ``(n_rep, n)`` with ``separate_rep`` and ``(n, n)`` otherwise.
            data: Optional ground-set data matrix with ``n`` rows. Must
                expose ``tolist()``.
            data_rep: Represented-set data matrix with ``n_rep`` rows;
                required when building a kernel with ``separate_rep``.
            num_clusters: Number of clusters for clustered mode.
            cluster_labels: Optional cluster id per ground-set element, each
                within ``[0, num_clusters - 1]``.
            metric: Metric name handed to the kernel builders.
            num_neighbors: Positive neighbour count; required in sparse mode
                and must be ``None`` when a square dense kernel is built
                from ``data``.
            create_dense_cpp_kernel_in_python: When true the dense kernel is
                built here and passed to C++; when false the raw data is
                passed and the C++ constructor receives the metric instead.
                Must stay true when ``sijs`` is provided.
            pybind_mode: How a dense kernel is handed to C++: ``'list'``,
                ``'numpyarray'``, ``'array32'``, ``'array64'`` or ``'array'``.

        Raises:
            Exception: If any of the consistency checks on the arguments
                fails, if neither ``sijs`` nor ``data`` is provided, or if
                ``pybind_mode`` is unknown.
        """
        self.n = n
        self.n_rep = n_rep
        self.mode = mode
        self.metric = metric
        self.sijs = sijs
        self.data = data
        self.data_rep = data_rep
        self.num_neighbors = num_neighbors
        self.separate_rep = separate_rep
        self.clusters = None
        self.cluster_sijs = None
        self.cluster_map = None
        self.cluster_labels = cluster_labels
        self.num_clusters = num_clusters
        self.cpp_obj = None
        self.cpp_sijs = None
        self.cpp_ground_sub = None
        self.cpp_content = None
        self.effective_ground = None

        if self.n <= 0:
            raise Exception(
                "ERROR: Number of elements in ground set must be positive")

        if self.mode not in ['dense', 'sparse', 'clustered']:
            raise Exception(
                "ERROR: Incorrect mode. Must be one of 'dense', 'sparse' or 'clustered'"
            )

        if self.separate_rep:
            if self.n_rep is None or self.n_rep <= 0:
                raise Exception(
                    "ERROR: separate represented intended but number of elements in represented not specified or not positive"
                )
            if self.mode != "dense":
                raise Exception(
                    "Only dense mode supported if separate_rep = True")

        if self.mode == "clustered":
            if self.cluster_labels is not None:
                if self.num_clusters is None or self.num_clusters <= 0:
                    raise Exception(
                        "ERROR: Positive number of clusters must be provided in clustered mode when cluster_labels is provided"
                    )
                if len(self.cluster_labels) != self.n:
                    raise Exception(
                        "ERROR: cluster_labels's size is NOT same as ground set size"
                    )
                if not all(0 <= ele <= self.num_clusters - 1
                           for ele in self.cluster_labels):
                    raise Exception(
                        "Cluster IDs/labels contain invalid values")
            elif self.num_clusters is not None and self.num_clusters <= 0:
                raise Exception("Invalid number of clusters provided")

        if self.sijs is not None:  # User has provided similarity kernel
            if not create_dense_cpp_kernel_in_python:
                raise Exception(
                    "ERROR: create_dense_cpp_kernel_in_python is to be set to False ONLY when a similarity kernel is not provided and a CPP kernel is desired to be created in CPP"
                )
            # sparse.csr_matrix is the public name; the scipy.sparse.csr
            # submodule path is a deprecated private namespace.
            if isinstance(self.sijs, sparse.csr_matrix):
                if num_neighbors is None or num_neighbors <= 0:
                    raise Exception(
                        "ERROR: Positive num_neighbors must be provided for given sparse kernel"
                    )
                if mode != "sparse":
                    raise Exception(
                        "ERROR: Sparse kernel provided, but mode is not sparse"
                    )
            elif isinstance(self.sijs, np.ndarray):
                if self.separate_rep is None:
                    raise Exception(
                        "ERROR: separate_rep bool must be specified with custom dense kernel"
                    )
                if mode != "dense":
                    raise Exception(
                        "ERROR: Dense kernel provided, but mode is not dense")
            else:
                raise Exception("Invalid kernel provided")

            kernel_shape = np.shape(self.sijs)
            if self.separate_rep:
                # Rows index the represented set, columns the ground set.
                if kernel_shape[1] != self.n or kernel_shape[0] != self.n_rep:
                    raise Exception(
                        "ERROR: Inconsistency between n_rep, n and no of rows, columns of given kernel"
                    )
            elif kernel_shape[0] != self.n or kernel_shape[1] != self.n:
                raise Exception(
                    "ERROR: Inconsistentcy between n and dimensionality of given similarity kernel"
                )
            if self.data is not None or self.data_rep is not None:
                print(
                    "WARNING: similarity kernel found. Provided data matrix will be ignored."
                )
        elif self.data is not None:  # Build the kernel from the data matrix
            if self.separate_rep:
                if self.data_rep is None:
                    raise Exception("Represented data matrix not given")
                if np.shape(self.data)[0] != self.n or np.shape(
                        self.data_rep)[0] != self.n_rep:
                    raise Exception(
                        "ERROR: Inconsistentcy between n, n_rep and no of examples in the given ground data matrix and represented data matrix"
                    )
            else:
                if self.data_rep is not None:
                    print(
                        "WARNING: Represented data matrix not required but given, will be ignored."
                    )
                if np.shape(self.data)[0] != self.n:
                    raise Exception(
                        "ERROR: Inconsistentcy between n and no of examples in the given data matrix"
                    )

            if self.mode == "clustered":
                # Per the original author's note, this also creates the
                # clusters when they are not provided.
                (self.clusters, self.cluster_sijs,
                 self.cluster_map) = create_cluster_kernels(
                     self.data.tolist(), self.metric, self.cluster_labels,
                     self.num_clusters)
            elif self.separate_rep:  # mode is always dense here
                if create_dense_cpp_kernel_in_python:
                    self.sijs = np.array(
                        subcp.create_kernel_NS(self.data.tolist(),
                                               self.data_rep.tolist(),
                                               self.metric))
            elif self.mode == "dense":
                if self.num_neighbors is not None:
                    raise Exception(
                        "num_neighbors wrongly provided for dense mode")
                if create_dense_cpp_kernel_in_python:
                    self.sijs = np.array(
                        subcp.create_square_kernel_dense(
                            self.data.tolist(), self.metric))
            else:  # sparse
                if self.num_neighbors is None or self.num_neighbors <= 0:
                    # Fail early with a clear message instead of handing an
                    # unusable neighbour count to the C++ kernel builder.
                    raise Exception(
                        "ERROR: Positive num_neighbors must be provided for sparse mode"
                    )
                self.cpp_content = np.array(
                    subcp.create_kernel(self.data.tolist(), self.metric,
                                        self.num_neighbors))
                # cpp_content holds (values, row indices, column indices);
                # astype is the vectorised way to get integer indices.
                val = self.cpp_content[0]
                row = self.cpp_content[1].astype(int)
                col = self.cpp_content[2].astype(int)
                self.sijs = sparse.csr_matrix((val, (row, col)),
                                              shape=(n, n))
        else:
            raise Exception(
                "ERROR: Neither ground set data matrix nor similarity kernel provided"
            )

        if separate_rep is None:
            self.separate_rep = False

        # Dummy set so that the pybind11 binding succeeds.
        self.cpp_ground_sub = {-1}

        # Break the kernel into simpler native structures for implicit
        # pybind11 conversion and build the C++ object.
        if self.mode == "dense" and create_dense_cpp_kernel_in_python:
            if pybind_mode == "list":
                self.cpp_sijs = self.sijs.tolist()
                if isinstance(self.cpp_sijs[0], (int, float)):
                    # Defensive: wrap a flat list so a list of lists is
                    # passed to pybind11.
                    self.cpp_sijs = [self.cpp_sijs]

                self.cpp_obj = FacilityLocation(self.n, self.cpp_sijs, False,
                                                self.cpp_ground_sub,
                                                self.separate_rep)
            elif pybind_mode == "numpyarray":
                self.cpp_obj = FacilityLocation(self.n, self.sijs, False,
                                                self.cpp_ground_sub,
                                                self.separate_rep)
            elif pybind_mode in ("array32", "array64", "array"):
                # ndarray.astype returns the converted array rather than
                # converting in place, so the result must be assigned back
                # for the requested precision to take effect.
                if pybind_mode == "array32":
                    self.sijs = self.sijs.astype('float32', copy=False)
                elif pybind_mode == "array64":
                    self.sijs = self.sijs.astype('float64', copy=False)
                self.cpp_obj = FacilityLocation2()
                self.cpp_obj.pybind_init(self.n, self.sijs, False,
                                         self.cpp_ground_sub,
                                         self.separate_rep)
            else:
                raise Exception("Invalid pybind mode!")

        elif self.mode == "dense":
            # The kernel is created on the C++ side from the raw data.
            if self.separate_rep:
                self.cpp_obj = FacilityLocation(self.n, self.data.tolist(),
                                                self.data_rep.tolist(), True,
                                                self.metric)
            else:
                # [[0.]] is a placeholder for the unused represented data.
                self.cpp_obj = FacilityLocation(self.n, self.data.tolist(),
                                                [[0.]], False, self.metric)

        elif self.mode == "sparse":
            # CSR components: non-zero values, row pointer array and the
            # column index of every non-zero value.
            self.cpp_sijs = {
                'arr_val': self.sijs.data.tolist(),
                'arr_count': self.sijs.indptr.tolist(),
                'arr_col': self.sijs.indices.tolist(),
            }
            self.cpp_obj = FacilityLocation(self.n, self.cpp_sijs['arr_val'],
                                            self.cpp_sijs['arr_count'],
                                            self.cpp_sijs['arr_col'])

        elif self.mode == "clustered":
            nested_kernels = []
            for kernel in self.cluster_sijs:
                rows = kernel.tolist()
                if isinstance(rows[0], (int, float)):
                    # A 1-D kernel converts to a flat list; pybind11 needs a
                    # list of lists, so wrap it.
                    rows = [rows]
                nested_kernels.append(rows)
            self.cluster_sijs = nested_kernels

            self.cpp_obj = FacilityLocation(self.n, self.clusters,
                                            self.cluster_sijs,
                                            self.cluster_map)

        self.effective_ground = self.cpp_obj.getEffectiveGroundSet()