예제 #1
0
    def handleCluster(self, event=None):
        '''Run k-means on the columns chosen in a ClusterDialog and record the result.

        Opens a ClusterDialog, projects the selected columns onto their
        principal axes via SVD, runs analysis.kmeans, stores a ClusterData
        object, appends the cluster codes to the data object, and lists the
        cluster under a (possibly user-supplied) name in the cluster listbox.
        '''
        print('handleCluster')
        self.ClusterWindow = ClusterDialog(self.root, self.dataObj)

        # No data file has been opened yet.
        if self.ClusterWindow.headers is None:
            tkMessageBox.showerror('No File Opened!', 'Please open a file first')
            return

        # Dialog was cancelled.
        if self.ClusterWindow.result is None:
            return

        self.ClusterObjects = []

        # Project the selected columns onto their principal axes (PCA via SVD).
        A = self.dataObj.columns_data(self.ClusterWindow.result[0])
        A = A.astype(float)
        m = np.mean(A, axis=0)
        D = A - m
        U, S, V = np.linalg.svd(D, full_matrices=False)
        # NOTE(review): '*' is a matrix product only if A is np.matrix — confirm
        # columns_data's return type; for ndarray this would broadcast elementwise.
        projectedData = (V * D.T).T

        means, codebook, self.errors = analysis.kmeans(self.dataObj, self.ClusterWindow.result[0], self.ClusterWindow.result[1])
        c_object = ClusterData(projectedData, self.ClusterWindow.result[0], self.ClusterWindow.result[1], codebook, means, self.errors)
        self.ClusterObjects.append(c_object)
        codebook = codebook.T.tolist()[0]
        self.dataObj.addColumn(codebook, 'cluster', 'numeric')
        # Fall back to an auto-generated name when the dialog supplied none.
        if self.ClusterWindow.result[1] is None:
            name = 'Cluster' + str(self.ClusterNum)
            self.ClusterNum += 1
        else:
            name = self.ClusterWindow.result[1]

        self.ClusterBoxA.insert(tk.END, name)
예제 #2
0
 def handleClustering(self, event=None):
     if self.data == None:
         print "Choose input file"
         return
     
     # Create selection dialog box
     dialog = clusterDialog(self.root, self.data.get_headers(), 
                                 "Select columns for clustering analysis")
     # Terminate if cancelled
     if len(dialog.cols) == 0:
         return
     elif len(dialog.cols) == 1:
         print "Select at least 2 columns for clustering analysis"
         return
     
     try:
         n = int(dialog.num.get())
     except ValueError:
         print "Please enter an integer for the number of clusters"
         return
     
     if n == 0:
         print "Please enter a positive integer for the number of clusters"
         return
 
     self.means, ids, self.errors = analysis.kmeans(self.data, 
                                             dialog.cols, n, dialog.metric)
     
     self.clusterData = data.ClusterData(self.data.get_headers(), 
                             self.data.get_data(self.data.get_headers()))
     self.clusterData.add_column(str(n)+"clusterIds", ids)
     print "Press 'Show clusters' and select the clusterIds column to view clustering"
예제 #3
0
파일: classifiers.py 프로젝트: jydong/TagL
    def build(self, A, categories, K=None, kmeans2=False):
        '''Builds the classifier give the data points in A and the categories'''

        # Map each category label to a class index (np.unique + return_inverse).
        labels, idx = np.unique(np.array(categories.T), return_inverse=True)
        self.num_classes = len(labels)
        self.num_features = A.shape[1]
        self.class_labels = labels

        for c in range(self.num_classes):
            rows = A[(idx == c), :]
            if K is None:
                # No K: every training row of the class becomes an exemplar.
                self.exemplars.append(rows)
            else:
                # K given: condense the class to its k-means codebook,
                # dispatching to kmeans2 when requested.
                cluster_fn = an.kmeans2 if kmeans2 else an.kmeans
                self.exemplars.append(
                    cluster_fn(rows, self.KNN_headers, K, whiten=False)[0])

        return
예제 #4
0
	def cluster(self, event = None):
		if self.data == None:
			tkMessageBox.showwarning("Instructions", "You must read in a file first")
			return
		headers = self.data.get_headers()
		output = ClusterSelect(self.root, headers)
		if output.okField == False:
			self.label['text'] = "Data selection cancelled"
			return
		if output.result == False:
			return	
		print output.result
		codes = analysis.kmeans(self.data, output.result[0], output.result[1], whiten=True, categories = '')[1]
		newCol = ["Categories", "numeric"]
		codes = codes.T.tolist()[0]
		for code in codes:
			newCol.append(code)
		self.data.addData(newCol)
		newHeaders =output.result[0]
		newHeaders.append("Categories")
		print "Cheking if checked"
		print output.result[1]
		d = self.handlePlotData(newHeaders, output.result[2], codes)
		
		# removes the added Categories column once finished
		self.data.removeLast()
예제 #5
0
	def handleClustering(self,event=None):
		self.cbox = ClusterDialogBox(self.root,self.data.get_headers())
		self.numclusters=int(self.cbox.clusterNum)
		self.smoothclrscheme=self.cbox.getContinuousColorScheme()
		if len(self.cbox.datacols)!=0 and self.numclusters>0:
			headers=[]
			for head in self.cbox.datacols:
				headers.append(head)
			self.codebook, self.codes, self.errors = an.kmeans(self.data,headers,self.numclusters)
			self.clusterList.append([self.codebook, self.codes, self.errors,headers])
		if self.cbox.getName() == None:
			name = "Cluster #" + str(self.num_cluster)
			self.num_cluster += 1

		else:
			name = self.cbox.getName()
		self.clusterWindow.insert(tk.END, name)
		print name


		if self.cbox.headers == None:
			print "Select a file"
			self.handleOpen()

		if self.cbox.getDatacols() == None:
			return
예제 #6
0
    def build(self, A, categories, K=None):
        '''Builds the classifier give the data points in A and the categories

        Stores one exemplar matrix per class; when K is given, each class is
        condensed to K k-means centroids instead of keeping its raw rows.
        '''
        self.numCategories = len(categories)
        self.numFeatures = A.get_data().shape[1]
        # Map category labels to class indices and count members per class.
        self.unique, self.mapping, self.counts = np.unique(np.array(
            categories.T),
                                                           return_inverse=True,
                                                           return_counts=True)
        self.numClasses = self.unique.size
        self.ogLabels = self.unique
        self.exemplars = [
            A.get_data()[(self.mapping == i), :]
            for i in range(self.numClasses)
        ]
        # PEP 8: identity comparison with None, not equality.
        if K is not None:
            # Replace each class's raw rows by its K k-means centroids.
            temp = []
            for e in self.exemplars:
                codebook, codes, error, quality = an.kmeans(None,
                                                            headers=None,
                                                            whiten=False,
                                                            K=K,
                                                            A=e)
                temp.append(codebook)
            self.exemplars = temp
        return
예제 #7
0
    def build(self, A, categories, K=None):
        '''Builds the classifier give the data points in A and the categories'''

        # np.unique yields the distinct labels plus, via return_inverse,
        # the class index of every row in A.
        labels, row_class = np.unique(np.array(categories.T),
                                      return_inverse=True)
        self.num_class = len(labels)
        self.num_feature = A.shape[1]
        self.class_labels = labels

        for c in range(self.num_class):
            members = A[(row_class == c), :]
            if K is None:
                # No K: keep every row of the class as an exemplar.
                self.exemplar.append(members)
            else:
                # K given: condense the class to its k-means codebook.
                result = an.kmeans(members, self.headers, K, whiten=False)
                self.exemplar.append(result[0])

        return
예제 #8
0
def main():
    d = data.Data("datasets/clusterdata.csv")

    codebook, codes, errors = an.kmeans(d, d.getHeaderNum(), 2)

    print "\nCodebook\n"
    print codebook

    print "\nCodes\n"
    print codes
예제 #9
0
	def handleCluster(self):
		'''Ask the user for columns and a cluster count, then run k-means.

		Adds the resulting cluster codes to the data as a new column.
		'''
		d = ClusterDialog(self.root, self.data.get_headers())
		# PEP 8: identity comparison with None, not equality.
		if d.result is not None and int(d.clusters) > 0:
			# d.result holds header *indices*; translate them to names
			# (get_headers() hoisted out of the loop).
			all_headers = self.data.get_headers()
			headers = [all_headers[index] for index in d.result]
			codebook, codes, errors = analysis.kmeans(self.data, headers, int(d.clusters))
			self.clusterCount += 1
			self.data.addColumn("Clusters %d" % (self.clusterCount,), codes)
			self.headers = self.data.get_headers()
예제 #10
0
 def handleCluster(self):
     d = clusterDialog(self.root, self.data.getHeaders())
     if d.result != None and int(d.clusters) > 0:
         print d.result
         headers = []
         for index in d.result:
             headers.append(self.data.headersNumeric[index])
         codebook, codes, errors = analysis.kmeans(self.data, headers, int(d.clusters))
         print codes
         self.clusterCount += 1
         print self.clusterCount
         self.data.addColumn("Clusters %d" % (self.clusterCount,), codes)
         self.buildOptions()
예제 #11
0
	def cluster(self):
		if self.data is None:
			print 'you have no data'
			return
		variables = Dialogs.ClusterDialog(self.root, self.data.get_headers())
		if variables.result == []:
			print 'you didn\'t pick anything'
			return
		self.clusternumber+=1
		#print variables.numclusters
		#self.uniqueColors = variables.distColors
		#print self.uniqueColors
		codebook, codes, errors = analysis.kmeans(self.data, variables.result, variables.numclusters)
		#I add the col to the data so I can cluster the data itself to the PCA, rather than the transformed data
		self.data.add_column('clustering%d'%(self.clusternumber),'numeric', codes)
예제 #12
0
	def clusterPCA(self):
		if self.PCAanalysis != [] and self.PCAListbox.curselection() != ():
			self.PCA = self.PCAanalysis[self.PCAListbox.curselection()[0]]
		
		if self.PCA is None:
			print 'you have no data'
			return
		variables = Dialogs.ClusterDialog(self.root, self.PCA.get_headers())
		if variables.result == []:
			print 'you didn\'t pick anything'
			return
		self.clusternumber+=1
		#print variables.numclusters
		#self.uniqueColors = variables.distColors
		#print self.uniqueColors
		codebook, codes, errors = analysis.kmeans(self.PCA, variables.result, variables.numclusters)
		self.PCA.add_column('clustering%d'%(self.clusternumber),'numeric', codes)
def main(argv):
    '''Run 2-cluster k-means on the all-numeric CSV file named in argv[1].'''
    if len(argv) < 2:
        print( 'Usage: python %s <all numeric CSV file>' % (argv[0]))
        exit(-1)

    try:
        d = data.Data( argv[1] )
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit propagate.
        print( 'Unable to open %s' % (argv[1]))
        exit(-1)

    codebook, codes, errors = an.kmeans( d, d.get_headers(), 2 )

    print( "\nCodebook\n")
    print( codebook)

    print( "\nCodes\n")
    print( codes)
def main(argv):
    '''Run 2-cluster k-means on the all-numeric CSV file named in argv[1].'''
    if len(argv) < 2:
        print('Usage: python %s <all numeric CSV file>' % (argv[0]))
        exit(-1)

    try:
        d = data.Data( argv[1] )
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit propagate.
        print('Unable to open %s' % (argv[1]))
        exit(-1)

    codebook, codes, errors = an.kmeans( d, d.get_headers(), 2 )

    print("\nCodebook\n")
    print(codebook)

    print("\nCodes\n")
    print(codes)
예제 #15
0
def main(argv):
    # if len(argv) < 2:
    #		  print 'Usage: python %s <all numeric CSV file>' % (argv[0])
    #		  exit(-1)
    #
    #	  try:
    #		  d = data.Data( argv[1] )
    #	  except:
    #		  print 'Unable to open %s' % (argv[1])
    #		  exit(-1)
    d = data.Data("clusterdata.csv")

    codebook, codes, errors = an.kmeans(d, d.get_headers(), 2)

    print "\nCodebook\n"
    print codebook

    print "\nCodes\n"
    print codes
예제 #16
0
 def handleKMeans(self):
     '''Prompt for a name and K, run k-means on the selected columns, store it.'''
     #this line cannot go after the dialogs are created because if the user
     #types something into the dialogs, it will deselect the columns
     # curselection() returns a tuple; build a filtered list instead of the
     # original 'del self.headers[i]' while iterating, which would both raise
     # on a tuple and skip elements after a deletion.
     self.headers = [h for h in self.cols.curselection() if h != 'None']
     nd = NameDialog(self.root)
     name = nd.get_name()
     kd = KDialog(self.root)
     self.k = kd.get_k()
     print("headers to cluster:", self.headers)
     codebook, codes, errors, quality = analysis.kmeans(self.data, self.headers, self.k, whiten=False, manhattan=self.manhattan)
     print("MDL:",quality)
     self.kmeans.append([codebook, codes, errors, quality])
     if name == "Default":
         # No explicit name: auto-number by current listbox size.
         self.PCAlistbox.insert(tk.END, "K-Means" + str(self.PCAlistbox.size()))
     else:
         self.PCAlistbox.insert(tk.END, name)
예제 #17
0
def main(argv):
	# if len(argv) < 2:
#		  print 'Usage: python %s <all numeric CSV file>' % (argv[0])
#		  exit(-1)
# 
#	  try:
#		  d = data.Data( argv[1] )
#	  except:
#		  print 'Unable to open %s' % (argv[1])
#		  exit(-1)
	d = data.Data( "clusterdata.csv" )

	codebook, codes, errors = an.kmeans( d, d.get_headers(), 2 )

	print "\nCodebook\n"
	print codebook

	print "\nCodes\n"
	print codes
예제 #18
0
    def build(self, A, categories, K=None):
        '''Builds the classifier give the data points in A and the categories'''

        # Distinct labels plus the class index of each row (np.unique).
        labels, assignment = np.unique(np.array(categories.T),
                                       return_inverse=True)
        self.num_classes = len(labels)
        self.NumFeature = A.shape[1]
        self.class_labels = labels

        for c in range(self.num_classes):
            rows = A[(assignment == c), :]
            if K is None:
                # No K: all rows of the class serve as exemplars.
                self.exemplars.append(rows)
            else:
                '''extension2, using other method to get exemplars'''
                out = an.kmeans(rows, self.headers, K, whiten=False)
                self.exemplars.append(out[0])
예제 #19
0
def main(argv):
    
    if len(argv) < 3:
        print 'usage: python %s <Train CSV file> <Train categories CSV file>' % (argv[0])
        exit(-1)

    # read the features and categories data sets
    print 'Reading %s and %s' % (argv[1], argv[2])
    try:
        d = data.Data(argv[1])
    except:
        print 'Unable to open %s' % (argv[1])
        exit(-1)

        
    try:
        catdata = data.Data(argv[2])
    except:
        print 'Unable to open %s' % (argv[2])
        exit(-1)

    
    # execute PCA analysis
    print 'Executing PCA'
    pcadata = an.pca( d, d.get_headers(), False )

    print 'Evaluating eigenvalues'
    # identify how many dimensions it takes to represent 90% of the variation
    evals = pcadata.get_eigenvalues()
    #print "type:",type(evals)
    evals=np.asmatrix(evals)
    #print "type2:",type(evals)
    #print "shape:  ",evals.shape
    esum = np.sum(evals)
    
    cum = evals[0,0]
    cumper = cum / esum
    i = 1
    while cumper < 0.999:
        cum += evals[0,i]
        cumper = cum/esum
        i += 1

    print 'Dimensions to reach 99.9% of variation:', i

    cheaders = pcadata.get_headers()[:i]

    # cluster the data
    K = 6

    # Use the average of each category as the initial means
    truecats = catdata.get_data(catdata.get_headers()[0:1])
    #tmpcats = truecats - 1 
    tmpcats = truecats # Don't adjust if we're using corrected labels
    
    print 'Clustering to %d clusters' % (K)
    codebook, codes, errors = an.kmeans(pcadata, cheaders, K, categories = tmpcats)
        
    # build a confusion matrix
    confmtx = [[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0]]
    for i in range(codes.shape[0]):
        #confmtx[codes[i,0]][int(truecats[i,0])-1] += 1
        confmtx[codes[i,0]][int(truecats[i,0])] += 1 # don't adjust

    print "\nConfusion Matrix:\n"
    print 'Actual->     Walking   Walk-up   Walk-dwn  Sitting   Standing   Laying'
    for i in range(len(confmtx)):
        s = 'Cluster %d' % (i)
        for val in confmtx[i]:
            s += "%10d" % (val)
        print s
    print
예제 #20
0
# Calculate alpha for each fracture population on every outcrop
data2 = calcAlpha(data2)
dataBeni = calcAlpha(dataBeni)

# Weighting the data by density. Approximate all outcrops as 10 m radius circular disks
data2 = weightByN(data2)
data2 = data2.sort_values('Id', ascending = True)

dataBeni = weightByN(dataBeni)
dataBeni = dataBeni.sort_values('Id', ascending = True)

# Calculate the centers of n clusters
# .as_matrix() was removed in pandas 1.0; .to_numpy() is the direct replacement.
strikes = data2['strike'].to_numpy()
dips = data2['dip'].to_numpy()
centers1 = analysis.kmeans(strikes, dips, num=5, bidirectional=True, measurement='poles', tolerance=1e-15)

# Convert each cluster center back to a strike/dip pole.
# Collect pieces in lists and concatenate once at the end: np.append in a
# loop copies the whole array every iteration (accidental O(n^2)).
cstrike_parts = []
cdip_parts = []
counter = 0

for i in centers1:
    cstrike, cdip = stereonet_math.geographic2pole(i[0],i[1])
    cstrike_parts.append(np.atleast_1d(cstrike))
    cdip_parts.append(np.atleast_1d(cdip))
    counter += 1

cstrikes = np.concatenate(cstrike_parts) if cstrike_parts else np.array([])
cdips = np.concatenate(cdip_parts) if cdip_parts else np.array([])

# Classify the data into populations by comparing the poles with a set of test poles
예제 #21
0
def main(argv):

    if len(argv) < 3:
        print 'usage: python %s <Train CSV file> <Train categories CSV file>' % (argv[0])
        exit(-1)

    # read the features and categories data sets
    print 'Reading %s and %s' % (argv[1], argv[2])
    try:
        d = data.Data(argv[1])
    except:
        print 'Unable to open %s' % (argv[1])
        exit(-1)


    try:
        catdata = data.Data(argv[2])
    except:
        print 'Unable to open %s' % (argv[2])
        exit(-1)


    # execute PCA analysis
    print 'Executing PCA'
    pcadata = an.pca( d.getHeaderRaw(), d )

    print 'Evaluating eigenvalues'
    # identify how many dimensions it takes to represent 90% of the variation
    evals = pcadata.getEigenvalues()
    esum = np.sum(evals)
    cum = evals[0]
    cumper = cum / esum
    i = 1
    while cumper < .999:
        cum += evals[i]
        cumper = cum/esum
        i += 1

    print 'Dimensions to reach 99.9% of variation:', i

    cheaders = pcadata.getHeaderRaw()[:i]

    # cluster the data
    K = 6

    # Use the average of each category as the initial means
    truecats = catdata.getDataNum(catdata.getHeaderRaw()[0:1])
    tmpcats = truecats - 1

    print "categoroties:", #tmpcats.tolist()

    print 'Clustering to %d clusters' % (K)
    # print pcadata
    # print cheaders
    # print K
    codebook, codes, errors = an.kmeans(pcadata, cheaders, K, categories = tmpcats)

    print len(codes)
    print codebook

    # build a confusion matrix
    confmtx = [[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,0,0]]

    codes = codes.astype(int)
    truecats = truecats.astype(int)

    print len(codes)
    print len(truecats)

    for i in range(codes.shape[0]):
        confmtx[codes[i,0]][int(truecats[i,0])-1] += 1

    print "\nConfusion Matrix:\n"
    print 'Actual->     Walking   Walk-up   Walk-dwn  Sitting   Standing   Laying'
    for i in range(len(confmtx)):
        s = 'Cluster %d' % (i)
        for val in confmtx[i]:
            s += "%10d" % (val)
        print s
    print
def main(argv):
    '''PCA + k-means pipeline (90% variance cutoff) with a confusion matrix.

    argv[1] is the training CSV, argv[2] the matching category-labels CSV.
    '''
    if len(argv) < 3:
        print('usage: python %s <Train CSV file> <Train categories CSV file>' % (argv[0]))
        exit(-1)

    # read the features and categories data sets
    print('Reading %s and %s' % (argv[1], argv[2]))
    try:
        d = data.Data(argv[1])
    except Exception:
        print('Unable to open %s' % (argv[1]))
        exit(-1)

    try:
        catdata = data.Data(argv[2])
    except Exception:
        print('Unable to open %s' % (argv[2]))
        exit(-1)

    # execute PCA analysis
    print('Executing PCA')
    pcadata = an.pca( d, d.get_headers() )

    print('Evaluating eigenvalues')
    # identify how many dimensions it takes to represent 90% of the variation
    evals = pcadata.get_eigenvalues()
    esum = np.sum(evals)
    cum = evals[0,0]
    cumper = cum / esum
    i = 1
    while cumper < 0.9:
        cum += evals[0,i]
        cumper = cum/esum
        i += 1

    print('Dimensions to reach 90% of variation:', i)

    cheaders = pcadata.get_headers()[:i]

    # cluster the data
    K = 6

    # Use the average of each category as the initial means
    truecats = catdata.get_data(catdata.get_headers()[0:1])
    tmpcats = truecats - 1

    print('Clustering to %d clusters' % (K))
    codebook, codes, errors = an.kmeans(pcadata, cheaders, K, categories=tmpcats)

    # build a KxK confusion matrix (generalized from the hard-coded 6x6 literal)
    confmtx = [[0] * K for _ in range(K)]
    for i in range(codes.shape[0]):
        confmtx[int(codes[i, 0])][int(truecats[i, 0])-1] += 1

    print("\nConfusion Matrix:\n")
    print('Actual->     Walking   Walk-up   Walk-dwn  Sitting   Standing   Laying')
    for i in range(len(confmtx)):
        s = 'Cluster %d' % (i)
        for val in confmtx[i]:
            s += "%10d" % (val)
        print(s)
    print()