def buildProductClusters():
    """Cluster products via k-means on the transposed customer matrix.

    Side effects: stores the transposed matrix in the module-global
    ``transpose`` and re-initializes the ``cl`` clustering module.

    Returns:
        The raw output of ``cl.kMeans`` — presumably (clusters, centroids);
        see cl.kMeans for the exact layout.
    """
    global transpose
    transpose = c.matrix.transpose()
    cl.__init__(transpose, p.products)
    # One cluster per ~8 products, at least 1. `//` keeps this an int
    # under Python 3 (plain `/` would produce a float k); identical to
    # the original integer `/` under Python 2.
    catNum = len(p.products) // 8 + 1
    outputs = cl.kMeans(catNum, 8)
    return outputs
def createSubcluster(indexMap, subMatrix, aMap):
    """Run k-means (k=25, 8 iterations) over one customer sub-matrix.

    Returns a list packed as:
        0: clusters            1: centroids
        2: cl.clusterMap       3: indexMap
        4: average silhouette  5: aMap
    """
    cl.__init__(subMatrix, c.customers, aMap)
    results = cl.kMeans(25, 8)
    clusters = results[0]
    centroids = results[1]
    # Capture clusterMap before the silhouette call, matching the order
    # in which the clustering module's state is read.
    clusterMap = cl.clusterMap
    avgSil = s.averageSilhouettes(clusters, subMatrix)
    return [clusters, centroids, clusterMap, indexMap, avgSil, aMap]
def run(names):
    """Build product clusters, per-cluster customer sub-clusters, and
    recommendations, keeping only "power" sub-clusters whose average
    silhouette beats the unfiltered clustering.

    Parameters
    ----------
    names: passed through to r.buildRecommendations (customer names,
        presumably — confirm against caller).

    Returns
    -------
    'again' if no sub-cluster beats the unfiltered silhouette, else a list:
        0: names                      1: c.customersMap
        2: normalized product clusters
        3: 'unfiltered results: ...' summary string
        4: 'filtered average: ...' summary string
        5: recommendation matrix      6: power clusters (incl. total)
        7: all sub-clusters
    """
    global products
    products = p.products
    results = [names, c.customersMap]
    global transpose
    transpose = c.matrix.transpose()
    cl.__init__(transpose, p.products)
    # One product cluster per ~8 products; `//` keeps an int k under
    # Python 3 (same value as integer `/` under Python 2).
    catNum = len(p.products) // 8 + 1
    outputs = cl.kMeans(catNum, 8)
    prodClusters = outputs[0]
    centroids = outputs[1]
    inputs = st.subMatrices(prodClusters)
    prodClusters = n.normalizeProdClusters(prodClusters, centroids,
                                           inputs[0], inputs[1], 0.2, 0.4)
    results.append(prodClusters)
    # Re-derive the sub-matrices from the normalized clusters.
    inputs = st.subMatrices(prodClusters)
    subMats = inputs[0]
    maps = inputs[1]
    indexMap = inputs[2]
    subClusters = []
    for i in range(0, len(subMats)):
        subCluster = st.createSubcluster(indexMap[i], subMats[i], maps[i])
        subCluster.append(r.buildRecommendations(names, [subCluster]))
        subClusters.append(subCluster)
    # Baseline: cluster over the full matrix (no product split).
    totCluster = st.createSubcluster(p.products, c.matrix, p.productsMap)
    totCluster.append(r.buildRecommendations(names, [totCluster]))
    powerClusters = []
    powerSil = []
    # Index 4 of a sub-cluster is its average silhouette score.
    results.append('unfiltered results: ' + str(totCluster[4]))
    for i in range(0, len(subClusters)):
        if subClusters[i][4] >= totCluster[4]:
            powerClusters.append(subClusters[i])
            powerSil.append(subClusters[i][4])
    if len(powerSil) == 0:
        # No sub-cluster beat the baseline; caller is expected to retry.
        return 'again'
    else:
        # True (float) division intended: silhouettes are scores.
        results.append('filtered average: ' + str(sum(powerSil) / len(powerSil)))
    powerClusters.append(totCluster)
    recommendationMatrix = r.buildRecommendations(names, powerClusters)
    results.append(recommendationMatrix)
    results.append(powerClusters)
    results.append(subClusters)
    return results
def createClusterHelpers(indexMap, subMatrix, aMap):
    """Run k-means (k=25, 8 iterations) on a sub-matrix and pack helper data.

    Returns a list laid out as:
        0: clusters           1: centroids
        2: cl.clusterMap      3: indexMap
        4: s.silhouettesList  5: average silhouette score
    """
    cl.__init__(subMatrix, c.customers, aMap)
    results = cl.kMeans(25, 8)
    clusters = results[0]
    centroids = results[1]
    # Read the clustering module's state before the silhouette call,
    # matching the original access order.
    clusterMap = cl.clusterMap
    avgSils = s.averageSilhouettes(clusters, subMatrix, centroids)
    # s.silhouettesList is read only after the averageSilhouettes call —
    # presumably populated by it as a side effect (TODO confirm).
    return [clusters, centroids, clusterMap, indexMap,
            s.silhouettesList, avgSils]
def dissolve(clusts, centroids, mats, maps, i):
    """Dissolve cluster i into smaller sub-clusters, in place.

    Re-runs k-means on the members of cluster i, removes entry i from each
    of the four parallel lists, and appends each resulting sub-cluster
    (with a freshly rebuilt matrix and map via st.redoMatrix) to the ends
    of the lists — keeping them parallel.

    Mutates all four list arguments; returns None.
    """
    trans = mats[i].transpose()
    cl.__init__(trans, clusts[i], maps[i])
    # One sub-cluster per ~8 members, at least 1. `//` keeps an int k
    # under Python 3 (same value as integer `/` under Python 2).
    num = len(clusts[i]) // 8 + 1
    results = cl.kMeans(num, 20)
    pClusts = results[0]
    pCents = results[1]
    clusts.pop(i)
    centroids.pop(i)
    mats.pop(i)
    maps.pop(i)
    for j in range(0, len(pClusts)):
        clusts.append(pClusts[j])
        centroids.append(pCents[j])
        # Each appended sub-cluster gets its own rebuilt matrix/map so the
        # four lists stay index-aligned.
        newMat = []
        newMap = {}
        st.redoMatrix(clusts, len(clusts) - 1, newMat, newMap)
        mats.append(newMat)
        maps.append(newMap)
def dissolve(clusts, centroids, mats, maps, i):
    """Dissolve cluster i into smaller sub-clusters, in place.

    Re-clusters the members of cluster i with k-means, drops entry i from
    each parallel list (clusts, centroids, mats, maps), then appends every
    new sub-cluster along with a rebuilt matrix/map (st.redoMatrix) so the
    lists remain index-aligned.

    Mutates all four list arguments; returns None.
    """
    trans = mats[i].transpose()
    cl.__init__(trans, clusts[i], maps[i])
    # One sub-cluster per ~8 members, at least 1. `//` keeps an int k
    # under Python 3 (identical to integer `/` under Python 2).
    num = len(clusts[i]) // 8 + 1
    results = cl.kMeans(num, 20)
    pClusts = results[0]
    pCents = results[1]
    clusts.pop(i)
    centroids.pop(i)
    mats.pop(i)
    maps.pop(i)
    for pClust, pCent in zip(pClusts, pCents):
        clusts.append(pClust)
        centroids.append(pCent)
        # Rebuild a matrix/map for the sub-cluster just appended.
        newMat = []
        newMap = {}
        st.redoMatrix(clusts, len(clusts) - 1, newMat, newMap)
        mats.append(newMat)
        maps.append(newMap)
def create_covariance_matrix(use_file, students, file, verbose=False):
    '''
    Reads the data from the file (if we need to fix how the data is read,
    change clustering init.)
    Preprocesses data with one hot encoding (changes categorical variables
    into numerical.)
    Fixes the matrix if it's not positive semidefinite (adds a small
    multiple of the identity.)

    Parameters
    ----------
    use_file: indicates if we want to use the input from the file (bool).
    students: students to include in calculation (Student list).
    file: file to use (if use_file = True). NOTE: shadows the builtin
        `file` under Python 2; name kept for interface compatibility.
    verbose: if True, print intermediate values for debugging (bool).

    Returns
    -------
    (data_array, one_hot_data_preprocessed, covariance_matrix,
     dict_key_vals)

    Raises
    ------
    DistanceError: if the covariance matrix is not square, or cannot be
        made positive semidefinite by the identity correction.
    '''
    if use_file:
        data_array_tup = clustering.__init__(file)
    else:
        # Create covariance matrix from students themselves.
        multi_array = []
        for student in students:
            attributes = student.get_numerical_student_properties()
            multi_array.append(attributes)
        # Renamed loop variable (was `s`): avoids shadowing module-level
        # names; behavior unchanged.
        IDs = [student.ID for student in students]
        data_array = np.array(multi_array)
        if verbose:
            # Parenthesized prints are valid and identical in Py2 and Py3.
            print("Multi array is " + str(data_array))
        data_array_tup = (data_array, IDs)
    data_array = data_array_tup[0]
    one_hot_data_preprocessed_tup = clustering.do_preprocessing(data_array_tup)
    one_hot_data_preprocessed = one_hot_data_preprocessed_tup[0]
    dict_key_vals = one_hot_data_preprocessed_tup[1]
    if verbose:
        print("One hot data preprocessed is: ")
        print(one_hot_data_preprocessed)
        print(one_hot_data_preprocessed.shape)
    # rowvar = 0 because each column represents a variable, while the rows
    # are observations.
    covariance_matrix = np.cov(one_hot_data_preprocessed, rowvar=0)
    if verbose:
        print("Covariance matrix is:")
        print(covariance_matrix)
    shape = covariance_matrix.shape
    num_rows = shape[0]
    num_cols = shape[1]
    # Should never happen: np.cov yields a square matrix.
    if not (num_rows == num_cols):
        raise DistanceError("Covariance matrix is not a square matrix.")
    if is_positive_semidefinite(covariance_matrix):
        if verbose:
            print("Pos semi def on the first try!")
    else:
        # Our covariance matrix is not positive semidefinite -- an
        # arithmetic error. Add (a small number * the identity matrix) to
        # the covariance matrix to fix this error.
        if verbose:
            print("Not pos semi def on the first try!")
        n = num_rows
        i = np.array(np.identity(n))
        factor = 10. ** -10
        # A matrix that is a small number times the identity.
        small_identity = np.multiply(i, factor)
        # Add it to our covariance matrix to make it positive semidefinite.
        result = np.add(small_identity, covariance_matrix)
        if not is_positive_semidefinite(result):
            raise DistanceError(
                "Fixed covariance matrix is not positive semidefinite.")
        else:
            covariance_matrix = result
    return (data_array, one_hot_data_preprocessed, covariance_matrix,
            dict_key_vals)
def create_covariance_matrix(use_file, students, file, verbose=False):
    '''
    Reads the data from the file (if we need to fix how the data is read,
    change clustering init.)
    Preprocesses data with one hot encoding (changes categorical variables
    into numerical.)
    Fixes the matrix if it's not positive semidefinite (adds a small
    multiple of the identity.)

    Parameters
    ----------
    use_file: indicates if we want to use the input from the file (bool).
    students: students to include in calculation (Student list).
    file: file to use (if use_file = True). NOTE: shadows the builtin
        `file` under Python 2; name kept for interface compatibility.
    verbose: if True, print intermediate values for debugging (bool).

    Returns
    -------
    (data_array, one_hot_data_preprocessed, covariance_matrix,
     dict_key_vals)

    Raises
    ------
    DistanceError: if the covariance matrix is not square, or cannot be
        made positive semidefinite by the identity correction.
    '''
    if use_file:
        data_array_tup = clustering.__init__(file)
    else:
        # Create covariance matrix from students themselves.
        multi_array = []
        for student in students:
            multi_array.append(student.get_numerical_student_properties())
        # Renamed comprehension variable (was `s`): avoids shadowing
        # module-level names; behavior unchanged.
        IDs = [student.ID for student in students]
        data_array = np.array(multi_array)
        if verbose:
            # Parenthesized single-argument prints behave identically in
            # Python 2 and are required syntax in Python 3.
            print("Multi array is " + str(data_array))
        data_array_tup = (data_array, IDs)
    data_array = data_array_tup[0]
    one_hot_data_preprocessed_tup = clustering.do_preprocessing(data_array_tup)
    one_hot_data_preprocessed = one_hot_data_preprocessed_tup[0]
    dict_key_vals = one_hot_data_preprocessed_tup[1]
    if verbose:
        print("One hot data preprocessed is: ")
        print(one_hot_data_preprocessed)
        print(one_hot_data_preprocessed.shape)
    # rowvar = 0 because each column represents a variable, while the rows
    # are observations.
    covariance_matrix = np.cov(one_hot_data_preprocessed, rowvar=0)
    if verbose:
        print("Covariance matrix is:")
        print(covariance_matrix)
    num_rows = covariance_matrix.shape[0]
    num_cols = covariance_matrix.shape[1]
    # Should never happen: np.cov yields a square matrix.
    if num_rows != num_cols:
        raise DistanceError("Covariance matrix is not a square matrix.")
    if is_positive_semidefinite(covariance_matrix):
        if verbose:
            print("Pos semi def on the first try!")
    else:
        # Our covariance matrix is not positive semidefinite -- an
        # arithmetic error. Add (a small number * the identity matrix) to
        # the covariance matrix to fix this error.
        if verbose:
            print("Not pos semi def on the first try!")
        # A small multiple of the identity, added to nudge the matrix
        # positive semidefinite.
        small_identity = np.multiply(np.array(np.identity(num_rows)),
                                     10. ** -10)
        result = np.add(small_identity, covariance_matrix)
        if not is_positive_semidefinite(result):
            raise DistanceError(
                "Fixed covariance matrix is not positive semidefinite.")
        covariance_matrix = result
    return (data_array, one_hot_data_preprocessed, covariance_matrix,
            dict_key_vals)
def run(names):
    """Cluster products, build per-cluster customer sub-clusters, and keep
    the "power" sub-clusters whose average silhouette (index 5 of a helper
    list) beats the unfiltered customer clustering.

    Parameters
    ----------
    names: passed through to r.buildRecommendations (customer names,
        presumably — confirm against caller).

    Returns
    -------
    'again' if no sub-cluster beats the baseline silhouette, else a list:
        0: names                  1: c.customersMap
        2: normalized product clusters
        3: p.productsMap          4: products
        5: power recommendation matrix
        6: [customer clusters helpers] (baseline, wrapped in a list)
        7: remaining (non-power) sub-cluster helpers
        8: power cluster helpers  9: c.matrix
        10: product clusters map  11: product cluster locator
    """
    global products
    products = p.products
    # indexes 0 and 1
    results = [names, c.customersMap]
    global transpose
    transpose = c.matrix.transpose()
    cl.__init__(transpose, p.products)
    # One product cluster per ~8 products; `//` keeps an int k under
    # Python 3 (same value as integer `/` under Python 2).
    catNum = len(p.products) // 8 + 1
    outputs = cl.kMeans(catNum, 8)
    productClusters = outputs[0]
    centroids = outputs[1]
    inputs = st.subMatrices(productClusters)
    productClusters = n.normalizeProdClusters(productClusters, centroids,
                                              inputs[0], inputs[1], 0.2, 0.4)
    # index 2
    results.append(productClusters)
    # index 3
    results.append(p.productsMap)
    # index 4
    results.append(products)
    # Re-derive sub-matrices from the normalized clusters.
    inputs = st.subMatrices(productClusters)
    subMats = inputs[0]
    maps = inputs[1]
    indexMap = inputs[2]
    subClustersHelpers = []
    for i in range(0, len(subMats)):
        subCluster = st.createSubclustersHelpers(indexMap[i], subMats[i],
                                                 maps[i])
        subCluster.append(r.buildRecommendations(names, [subCluster]))
        subClustersHelpers.append(subCluster)
    # Baseline: cluster customers over the full matrix (no product split).
    customerClustersHelpers = st.createSubclustersHelpers(p.products,
                                                          c.matrix,
                                                          p.productsMap)
    customerClustersHelpers.append(
        r.buildRecommendations(names, [customerClustersHelpers]))
    powerClustersHelpers = []
    powerI = []
    powerCount = 0
    productClusterLocator = []
    # Index 5 of a helpers list is its average silhouette score.
    for i in range(0, len(subClustersHelpers)):
        if subClustersHelpers[i][5] >= customerClustersHelpers[5]:
            powerClustersHelpers.append(subClustersHelpers[i])
            powerI.append(i)
            productClusterLocator.append(['power', powerCount])
            powerCount += 1
        else:
            productClusterLocator.append(['sub', i - powerCount])
    if len(powerClustersHelpers) == 0:
        # Nothing beat the baseline; caller is expected to retry.
        return 'again'
    # Remove the power clusters from subClustersHelpers. powerI is
    # ascending, so each earlier pop shifts later indices left by one —
    # `displacement` compensates.
    displacement = 0
    for i in range(0, len(powerI)):
        subClustersHelpers.pop(powerI[i] - displacement)
        displacement += 1
    powerRecMatrix = r.buildRecommendations(names, powerClustersHelpers)
    # index 5
    results.append(powerRecMatrix)
    # index 6
    results.append([customerClustersHelpers])
    # index 7
    results.append(subClustersHelpers)
    # index 8
    results.append(powerClustersHelpers)
    # index 9
    results.append(c.matrix)
    # index 10
    productClustersMap = st.createClusterMap(productClusters)
    results.append(productClustersMap)
    # index 11
    results.append(productClusterLocator)
    return results