def handleCluster(self, event=None):
    '''Creates a ClusterDialog, runs k-means on the selected columns, and
    records the result as a ClusterData object plus a new data column.'''
    print('handleCluster')
    self.ClusterWindow = ClusterDialog(self.root, self.dataObj)
    # No data file loaded yet -> nothing to cluster.
    if self.ClusterWindow.headers is None:
        tkMessageBox.showerror('No File Opened!', 'Please open a file first')
        return
    # Dialog was cancelled.
    if self.ClusterWindow.result is None:
        return
    self.ClusterObjects = []
    # Project the selected columns onto their principal axes (SVD of the
    # mean-centered data) so the clusters can be shown in PCA space.
    A = self.dataObj.columns_data(self.ClusterWindow.result[0])
    A = A.astype(float)
    m = np.mean(A, axis=0)
    D = A - m
    U, S, V = np.linalg.svd(D, full_matrices=False)
    # NOTE(review): '*' is matrix multiplication only when A is np.matrix;
    # for plain ndarrays it would be elementwise -- confirm A's type.
    projectedData = (V * D.T).T
    means, codebook, self.errors = analysis.kmeans(
        self.dataObj, self.ClusterWindow.result[0], self.ClusterWindow.result[1])
    c_object = ClusterData(projectedData, self.ClusterWindow.result[0],
                           self.ClusterWindow.result[1], codebook, means,
                           self.errors)
    self.ClusterObjects.append(c_object)
    # Attach the per-point cluster ids to the data set as a numeric column.
    codebook = codebook.T.tolist()[0]
    self.dataObj.addColumn(codebook, 'cluster', 'numeric')
    # Auto-generate a name when the dialog did not supply one.
    if self.ClusterWindow.result[1] is None:
        name = 'Cluster' + str(self.ClusterNum)
        self.ClusterNum += 1
    else:
        name = self.ClusterWindow.result[1]
    self.ClusterBoxA.insert(tk.END, name)
def handleClustering(self, event=None):
    '''Asks the user for columns and a cluster count, runs k-means, and
    stores the resulting cluster ids as a new column.'''
    if self.data is None:
        print("Choose input file")
        return
    # Create selection dialog box
    dialog = clusterDialog(self.root, self.data.get_headers(),
                           "Select columns for clustering analysis")
    # Terminate if cancelled
    if len(dialog.cols) == 0:
        return
    elif len(dialog.cols) == 1:
        print("Select at least 2 columns for clustering analysis")
        return
    # Validate the requested number of clusters.
    try:
        n = int(dialog.num.get())
    except ValueError:
        print("Please enter an integer for the number of clusters")
        return
    if n == 0:
        print("Please enter a positive integer for the number of clusters")
        return
    self.means, ids, self.errors = analysis.kmeans(self.data, dialog.cols, n,
                                                   dialog.metric)
    self.clusterData = data.ClusterData(
        self.data.get_headers(), self.data.get_data(self.data.get_headers()))
    self.clusterData.add_column(str(n) + "clusterIds", ids)
    print("Press 'Show clusters' and select the clusterIds column to view clustering")
def build(self, A, categories, K=None, kmeans2=False):
    '''Builds the classifier give the data points in A and the categories'''
    # Map each category label to a class index 0..C-1 (np.unique).
    labels, idx = np.unique(np.array(categories.T), return_inverse=True)
    self.class_labels = labels
    self.num_classes = len(labels)
    self.num_features = A.shape[1]
    for cls in range(self.num_classes):
        rows = A[(idx == cls), :]
        if K is None:
            # Keep every row of this class as an exemplar.
            self.exemplars.append(rows)
            continue
        # Otherwise summarize the class by its K cluster means.
        cluster = an.kmeans2 if kmeans2 else an.kmeans
        self.exemplars.append(cluster(rows, self.KNN_headers, K, whiten=False)[0])
    return
def cluster(self, event=None):
    '''Runs k-means on user-selected columns, plots the data colored by
    cluster id, then removes the temporary Categories column.'''
    if self.data is None:
        tkMessageBox.showwarning("Instructions", "You must read in a file first")
        return
    headers = self.data.get_headers()
    output = ClusterSelect(self.root, headers)
    if output.okField == False:
        self.label['text'] = "Data selection cancelled"
        return
    if output.result == False:
        return
    print(output.result)
    codes = analysis.kmeans(self.data, output.result[0], output.result[1],
                            whiten=True, categories='')[1]
    # Build a new column holding each point's cluster id.
    newCol = ["Categories", "numeric"]
    codes = codes.T.tolist()[0]
    for code in codes:
        newCol.append(code)
    self.data.addData(newCol)
    # Copy the header list: appending to output.result[0] directly would
    # mutate the dialog's own list (aliasing bug in the original).
    newHeaders = list(output.result[0])
    newHeaders.append("Categories")
    print("Cheking if checked")
    print(output.result[1])
    d = self.handlePlotData(newHeaders, output.result[2], codes)
    # removes the added Categories column once finished
    self.data.removeLast()
def handleClustering(self, event=None):
    '''Prompts for columns and a cluster count, runs k-means, and records
    the clustering under a (possibly auto-generated) name.'''
    self.cbox = ClusterDialogBox(self.root, self.data.get_headers())
    # Guard clauses first: without a file or selected columns there is
    # nothing to cluster. (In the original these checks ran AFTER the
    # k-means call, too late to prevent it.)
    if self.cbox.headers is None:
        print("Select a file")
        self.handleOpen()
        return
    if self.cbox.getDatacols() is None:
        return
    self.numclusters = int(self.cbox.clusterNum)
    self.smoothclrscheme = self.cbox.getContinuousColorScheme()
    if len(self.cbox.datacols) != 0 and self.numclusters > 0:
        headers = list(self.cbox.datacols)
        self.codebook, self.codes, self.errors = an.kmeans(
            self.data, headers, self.numclusters)
        self.clusterList.append([self.codebook, self.codes, self.errors, headers])
        # Auto-name the clustering when the dialog gave no name.
        if self.cbox.getName() is None:
            name = "Cluster #" + str(self.num_cluster)
            self.num_cluster += 1
        else:
            name = self.cbox.getName()
        self.clusterWindow.insert(tk.END, name)
        print(name)
def build(self, A, categories, K=None):
    '''Builds the classifier give the data points in A and the categories'''
    # figure out how many categories there are and get the mapping (np.unique)
    self.numCategories = len(categories)
    self.numFeatures = A.get_data().shape[1]
    self.unique, self.mapping, self.counts = np.unique(
        np.array(categories.T), return_inverse=True, return_counts=True)
    self.numClasses = self.unique.size
    self.ogLabels = self.unique
    # One exemplar matrix per class: every row of A belonging to that class.
    self.exemplars = [
        A.get_data()[(self.mapping == i), :] for i in range(self.numClasses)
    ]
    if K is not None:  # 'is not None' instead of '!= None'
        # Replace each class's rows with its K k-means cluster means.
        self.exemplars = [
            an.kmeans(None, headers=None, whiten=False, K=K, A=e)[0]
            for e in self.exemplars
        ]
    return
def build(self, A, categories, K=None):
    '''Builds the classifier give the data points in A and the categories'''
    # np.unique yields the distinct labels plus, via return_inverse, the
    # class index of every sample in `categories`.
    labels, inverse = np.unique(np.array(categories.T), return_inverse=True)
    self.class_labels = labels
    self.num_class = len(labels)
    self.num_feature = A.shape[1]
    for c in range(self.num_class):
        members = A[(inverse == c), :]
        if K is None:
            # Keep every member row of class c as an exemplar.
            self.exemplar.append(members)
        else:
            # Compress class c down to its K k-means cluster centers.
            codebook, codes, errors = an.kmeans(members, self.headers, K,
                                                whiten=False)
            self.exemplar.append(codebook)
    return
def main():
    '''Reads the cluster demo data set and prints the K=2 k-means
    codebook (cluster means) and per-point cluster codes.'''
    d = data.Data("datasets/clusterdata.csv")
    codebook, codes, errors = an.kmeans(d, d.getHeaderNum(), 2)
    # Python 3 print() calls replace the original py2 print statements.
    print("\nCodebook\n")
    print(codebook)
    print("\nCodes\n")
    print(codes)
def handleCluster(self):
    '''Asks the user for columns and a cluster count, clusters the data,
    and stores the cluster ids as a new "Clusters N" column.'''
    d = ClusterDialog(self.root, self.data.get_headers())
    # Proceed only if the dialog was confirmed with a positive K.
    if d.result is not None and int(d.clusters) > 0:
        all_headers = self.data.get_headers()
        # d.result holds indices into the header list.
        headers = [all_headers[index] for index in d.result]
        codebook, codes, errors = analysis.kmeans(self.data, headers,
                                                  int(d.clusters))
        self.clusterCount += 1
        self.data.addColumn("Clusters %d" % (self.clusterCount,), codes)
        self.headers = self.data.get_headers()
def handleCluster(self):
    '''Asks the user for columns and a cluster count, clusters the data,
    and stores the cluster ids as a new "Clusters N" column.'''
    d = clusterDialog(self.root, self.data.getHeaders())
    # Proceed only if the dialog was confirmed with a positive K.
    if d.result is not None and int(d.clusters) > 0:
        print(d.result)
        # d.result holds indices into the numeric-header list.
        headers = [self.data.headersNumeric[index] for index in d.result]
        codebook, codes, errors = analysis.kmeans(self.data, headers,
                                                  int(d.clusters))
        print(codes)
        self.clusterCount += 1
        print(self.clusterCount)
        self.data.addColumn("Clusters %d" % (self.clusterCount,), codes)
        self.buildOptions()
def cluster(self):
    '''Runs k-means on user-selected columns and appends the cluster ids
    to the data as a new numeric column.'''
    if self.data is None:
        print('you have no data')
        return
    variables = Dialogs.ClusterDialog(self.root, self.data.get_headers())
    # Empty result means the dialog was cancelled / nothing was chosen.
    if variables.result == []:
        print('you didn\'t pick anything')
        return
    self.clusternumber += 1
    codebook, codes, errors = analysis.kmeans(self.data, variables.result,
                                              variables.numclusters)
    # I add the col to the data so I can cluster the data itself to the
    # PCA, rather than the transformed data
    self.data.add_column('clustering%d' % (self.clusternumber), 'numeric', codes)
def clusterPCA(self):
    '''Runs k-means on the currently selected PCA analysis and appends the
    cluster ids to it as a new numeric column.'''
    if self.PCAanalysis != [] and self.PCAListbox.curselection() != ():
        self.PCA = self.PCAanalysis[self.PCAListbox.curselection()[0]]
    if self.PCA is None:
        print('you have no data')
        return
    variables = Dialogs.ClusterDialog(self.root, self.PCA.get_headers())
    # Empty result means the dialog was cancelled / nothing was chosen.
    if variables.result == []:
        print('you didn\'t pick anything')
        return
    self.clusternumber += 1
    codebook, codes, errors = analysis.kmeans(self.PCA, variables.result,
                                              variables.numclusters)
    self.PCA.add_column('clustering%d' % (self.clusternumber), 'numeric', codes)
def main(argv):
    '''Loads an all-numeric CSV named on the command line and prints the
    K=2 k-means codebook and per-point cluster codes.'''
    if len(argv) < 2:
        print('Usage: python %s <all numeric CSV file>' % (argv[0]))
        exit(-1)
    try:
        d = data.Data(argv[1])
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print('Unable to open %s' % (argv[1]))
        exit(-1)
    codebook, codes, errors = an.kmeans(d, d.get_headers(), 2)
    print("\nCodebook\n")
    print(codebook)
    print("\nCodes\n")
    print(codes)
def main(argv):
    '''Loads an all-numeric CSV named on the command line and prints the
    K=2 k-means codebook and per-point cluster codes.'''
    if len(argv) < 2:
        print('Usage: python %s <all numeric CSV file>' % (argv[0]))
        exit(-1)
    try:
        d = data.Data(argv[1])
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print('Unable to open %s' % (argv[1]))
        exit(-1)
    codebook, codes, errors = an.kmeans(d, d.get_headers(), 2)
    print("\nCodebook\n")
    print(codebook)
    print("\nCodes\n")
    print(codes)
def main(argv):
    '''Loads the bundled clusterdata.csv and prints the K=2 k-means
    codebook and per-point cluster codes.'''
    # (Dead commented-out argv-handling code removed; py2 print statements
    # converted to py3 calls.)
    d = data.Data("clusterdata.csv")
    codebook, codes, errors = an.kmeans(d, d.get_headers(), 2)
    print("\nCodebook\n")
    print(codebook)
    print("\nCodes\n")
    print(codes)
def handleKMeans(self):
    '''Clusters the currently selected columns with k-means and records
    the result in the listbox under a user-supplied name.'''
    # this line cannot go after the dialogs are created because if the user
    # types something into the dialogs, it will deselect the columns
    self.headers = self.cols.curselection()
    nd = NameDialog(self.root)
    name = nd.get_name()
    kd = KDialog(self.root)
    self.k = kd.get_k()
    # Filter out 'None' entries. The original deleted items while indexing
    # forward over the same sequence, which skips elements / raises
    # IndexError (and fails outright on a tkinter selection tuple);
    # building a new list avoids both problems.
    self.headers = [h for h in self.headers if h != 'None']
    print("headers to cluster:", self.headers)
    codebook, codes, errors, quality = analysis.kmeans(
        self.data, self.headers, self.k, whiten=False, manhattan=self.manhattan)
    print("MDL:", quality)
    self.kmeans.append([codebook, codes, errors, quality])
    if name == "Default":
        self.PCAlistbox.insert(tk.END, "K-Means" + str(self.PCAlistbox.size()))
    else:
        self.PCAlistbox.insert(tk.END, name)
def main(argv):
    '''Loads the bundled clusterdata.csv and prints the K=2 k-means
    codebook and per-point cluster codes.'''
    # (Dead commented-out argv-handling code removed; py2 print statements
    # converted to py3 calls.)
    d = data.Data("clusterdata.csv")
    codebook, codes, errors = an.kmeans(d, d.get_headers(), 2)
    print("\nCodebook\n")
    print(codebook)
    print("\nCodes\n")
    print(codes)
def build(self, A, categories, K=None):
    '''Builds the classifier give the data points in A and the categories'''
    # Distinct category labels plus the class index of every sample.
    labels, inverse = np.unique(np.array(categories.T), return_inverse=True)
    self.class_labels = labels
    self.num_classes = len(labels)
    self.NumFeature = A.shape[1]
    for c in range(self.num_classes):
        rows = A[(inverse == c), :]
        if K is None:
            # All rows of class c serve directly as exemplars.
            self.exemplars.append(rows)
        else:
            codebook, codes, errors = an.kmeans(rows, self.headers, K,
                                                whiten=False)
            '''extension2, using other method to get exemplars'''
            self.exemplars.append(codebook)
def main(argv):
    '''Trains k-means on a PCA-reduced activity data set and prints a
    confusion matrix of cluster id vs. true activity category.'''
    if len(argv) < 3:
        print('usage: python %s <Train CSV file> <Train categories CSV file>' % (argv[0]))
        exit(-1)
    # read the features and categories data sets
    print('Reading %s and %s' % (argv[1], argv[2]))
    try:
        d = data.Data(argv[1])
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print('Unable to open %s' % (argv[1]))
        exit(-1)
    try:
        catdata = data.Data(argv[2])
    except Exception:
        print('Unable to open %s' % (argv[2]))
        exit(-1)
    # execute PCA analysis
    print('Executing PCA')
    pcadata = an.pca(d, d.get_headers(), False)
    print('Evaluating eigenvalues')
    # identify how many dimensions it takes to represent 99.9% of the variation
    evals = np.asmatrix(pcadata.get_eigenvalues())
    esum = np.sum(evals)
    cum = evals[0, 0]
    cumper = cum / esum
    i = 1
    while cumper < 0.999:
        cum += evals[0, i]
        cumper = cum / esum
        i += 1
    print('Dimensions to reach 99.9% of variation:', i)
    cheaders = pcadata.get_headers()[:i]
    # cluster the data
    K = 6
    # Use the average of each category as the initial means
    truecats = catdata.get_data(catdata.get_headers()[0:1])
    tmpcats = truecats  # Don't adjust if we're using corrected labels
    print('Clustering to %d clusters' % (K))
    codebook, codes, errors = an.kmeans(pcadata, cheaders, K, categories=tmpcats)
    # build a confusion matrix: rows = clusters, cols = true categories
    confmtx = [[0] * 6 for _ in range(6)]
    for i in range(codes.shape[0]):
        confmtx[codes[i, 0]][int(truecats[i, 0])] += 1  # labels already 0-based
    print("\nConfusion Matrix:\n")
    print('Actual-> Walking Walk-up Walk-dwn Sitting Standing Laying')
    for i in range(len(confmtx)):
        s = 'Cluster %d' % (i)
        for val in confmtx[i]:
            s += "%10d" % (val)
        print(s)
    print()
# Calculate alpha for each fracture population on every outcrop
data2 = calcAlpha(data2)
dataBeni = calcAlpha(dataBeni)

# Weighting the data by density. Approximate all outcrops as 10 m radius
# circular disks
data2 = weightByN(data2)
data2 = data2.sort_values('Id', ascending=True)
dataBeni = weightByN(dataBeni)
dataBeni = dataBeni.sort_values('Id', ascending=True)

# Calculate the centers of n clusters.
# .values replaces the deprecated DataFrame/Series.as_matrix(), which was
# removed in pandas 1.0.
strikes = data2['strike'].values
dips = data2['dip'].values
centers1 = analysis.kmeans(strikes, dips, num=5, bidirectional=True,
                           measurement='poles', tolerance=1e-15)

# Convert each cluster center from geographic coords back to strike/dip poles.
cstrikes = np.zeros(1)
cdips = np.zeros(1)
counter = 0
for i in centers1:
    cstrike, cdip = stereonet_math.geographic2pole(i[0], i[1])
    cstrikes = np.append(cstrikes, cstrike)
    cdips = np.append(cdips, cdip)
    counter += 1
# Drop the zero placeholder used to seed the accumulator arrays.
cstrikes = cstrikes[1:]
cdips = cdips[1:]

# Classify the data into populations by comparing the poles with a set of
# test poles
def main(argv):
    '''Trains k-means on a PCA-reduced activity data set and prints a
    confusion matrix of cluster id vs. true activity category.'''
    if len(argv) < 3:
        print('usage: python %s <Train CSV file> <Train categories CSV file>' % (argv[0]))
        exit(-1)
    # read the features and categories data sets
    print('Reading %s and %s' % (argv[1], argv[2]))
    try:
        d = data.Data(argv[1])
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print('Unable to open %s' % (argv[1]))
        exit(-1)
    try:
        catdata = data.Data(argv[2])
    except Exception:
        print('Unable to open %s' % (argv[2]))
        exit(-1)
    # execute PCA analysis
    print('Executing PCA')
    pcadata = an.pca(d.getHeaderRaw(), d)
    print('Evaluating eigenvalues')
    # identify how many dimensions it takes to represent 99.9% of the variation
    evals = pcadata.getEigenvalues()
    esum = np.sum(evals)
    cum = evals[0]
    cumper = cum / esum
    i = 1
    while cumper < .999:
        cum += evals[i]
        cumper = cum / esum
        i += 1
    print('Dimensions to reach 99.9% of variation:', i)
    cheaders = pcadata.getHeaderRaw()[:i]
    # cluster the data
    K = 6
    # Use the average of each category as the initial means
    truecats = catdata.getDataNum(catdata.getHeaderRaw()[0:1])
    tmpcats = truecats - 1  # shift 1-based category labels to 0-based
    # py2's trailing-comma print continued on the same line; end=' ' mimics it
    print("categoroties:", end=' ')
    print('Clustering to %d clusters' % (K))
    codebook, codes, errors = an.kmeans(pcadata, cheaders, K, categories=tmpcats)
    print(len(codes))
    print(codebook)
    # build a confusion matrix: rows = clusters, cols = true categories
    confmtx = [[0] * 6 for _ in range(6)]
    codes = codes.astype(int)
    truecats = truecats.astype(int)
    print(len(codes))
    print(len(truecats))
    for i in range(codes.shape[0]):
        confmtx[codes[i, 0]][int(truecats[i, 0]) - 1] += 1
    print("\nConfusion Matrix:\n")
    print('Actual-> Walking Walk-up Walk-dwn Sitting Standing Laying')
    for i in range(len(confmtx)):
        s = 'Cluster %d' % (i)
        for val in confmtx[i]:
            s += "%10d" % (val)
        print(s)
    print()
def main(argv):
    '''Trains k-means on a PCA-reduced activity data set and prints a
    confusion matrix of cluster id vs. true activity category.'''
    if len(argv) < 3:
        print('usage: python %s <Train CSV file> <Train categories CSV file>' % (argv[0]))
        exit(-1)
    # read the features and categories data sets
    print('Reading %s and %s' % (argv[1], argv[2]))
    try:
        d = data.Data(argv[1])
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print('Unable to open %s' % (argv[1]))
        exit(-1)
    try:
        catdata = data.Data(argv[2])
    except Exception:
        print('Unable to open %s' % (argv[2]))
        exit(-1)
    # execute PCA analysis
    print('Executing PCA')
    pcadata = an.pca(d, d.get_headers())
    print('Evaluating eigenvalues')
    # identify how many dimensions it takes to represent 90% of the variation
    evals = pcadata.get_eigenvalues()
    esum = np.sum(evals)
    cum = evals[0, 0]
    cumper = cum / esum
    i = 1
    while cumper < 0.9:
        cum += evals[0, i]
        cumper = cum / esum
        i += 1
    print('Dimensions to reach 90% of variation:', i)
    cheaders = pcadata.get_headers()[:i]
    # cluster the data
    K = 6
    # Use the average of each category as the initial means
    truecats = catdata.get_data(catdata.get_headers()[0:1])
    tmpcats = truecats - 1  # shift 1-based category labels to 0-based
    print('Clustering to %d clusters' % (K))
    codebook, codes, errors = an.kmeans(pcadata, cheaders, K, categories=tmpcats)
    # build a confusion matrix: rows = clusters, cols = true categories
    confmtx = [[0] * 6 for _ in range(6)]
    for i in range(codes.shape[0]):
        confmtx[int(codes[i, 0])][int(truecats[i, 0]) - 1] += 1
    print("\nConfusion Matrix:\n")
    print('Actual-> Walking Walk-up Walk-dwn Sitting Standing Laying')
    for i in range(len(confmtx)):
        s = 'Cluster %d' % (i)
        for val in confmtx[i]:
            s += "%10d" % (val)
        print(s)
    print()