Ejemplo n.º 1
0
def buildProductClusters():
    """Cluster products with k-means over the transposed customer matrix.

    Re-initialises the clustering module ``cl`` with the transpose (also
    stored in the module-level ``transpose`` global) and runs k-means with
    roughly one cluster per 8 products.

    Returns:
        The raw ``cl.kMeans`` output (clusters and centroids).
    """
    global transpose
    transpose = c.matrix.transpose()
    cl.__init__(transpose, p.products)
    # One category per 8 products (integer division), plus one.
    categoryCount = len(p.products)/8 + 1
    return cl.kMeans(categoryCount, 8)
Ejemplo n.º 2
0
def createSubcluster(indexMap, subMatrix, aMap):
    """Run k-means (k=25, 8 iterations) on one sub-matrix and bundle results.

    Returns a list laid out as:
        [0] clusters          [1] centroids
        [2] cl.clusterMap     [3] indexMap
        [4] average silhouette score for the clusters
        [5] aMap
    """
    cl.__init__(subMatrix, c.customers, aMap)
    kmeansOut = cl.kMeans(25, 8)
    bundle = [
        kmeansOut[0],      # clusters
        kmeansOut[1],      # centroids
        cl.clusterMap,
        indexMap,
    ]
    bundle.append(s.averageSilhouettes(bundle[0], subMatrix))
    bundle.append(aMap)
    return bundle
Ejemplo n.º 3
0
def run(names):
    """Full pipeline: cluster products, sub-cluster customers per product
    cluster, keep only sub-clusters whose silhouette beats the unfiltered
    baseline ("power" clusters), and build recommendations from them.

    Parameters
    ----------
    names: forwarded into the results list and to r.buildRecommendations.

    Returns
    -------
    'again' when no sub-cluster beats the baseline silhouette, otherwise a
    results list: [names, c.customersMap, prodClusters,
    'unfiltered results: ...', 'filtered average: ...',
    recommendationMatrix, powerClusters, subClusters].
    """
    global products
    products = p.products
    results = [names, c.customersMap]

    # Product clustering: k-means over the transposed customer matrix,
    # roughly one cluster per 8 products.
    global transpose
    transpose = c.matrix.transpose()
    cl.__init__(transpose, p.products)
    catNum = len(p.products)/8 + 1
    outputs = cl.kMeans(catNum,8)
    prodClusters = outputs[0]
    centroids = outputs[1]

    # Normalize the product clusters using the per-cluster sub-matrices.
    inputs = st.subMatrices(prodClusters)
    prodClusters = n.normalizeProdClusters(prodClusters, centroids, inputs[0], inputs[1], 0.2, 0.4)
    results.append(prodClusters)

    # Recompute sub-matrices from the normalized clusters.
    inputs = st.subMatrices(prodClusters)
    subMats = inputs[0]
    maps = inputs[1]
    indexMap = inputs[2]

    # One customer sub-cluster (plus its recommendations) per product cluster.
    subClusters = []
    for i in range(0, len(subMats)):
        subCluster = st.createSubcluster(indexMap[i], subMats[i], maps[i])
        subCluster.append(r.buildRecommendations(names, [subCluster]))
        subClusters.append(subCluster)


    # Baseline: one clustering over the full matrix; index 4 of the
    # createSubcluster result is the average silhouette score.
    totCluster = st.createSubcluster(p.products, c.matrix, p.productsMap)
    totCluster.append(r.buildRecommendations(names,[totCluster]))
    powerClusters = []
    powerSil = []
    results.append('unfiltered results: ' + str(totCluster[4]))
    # Keep only sub-clusters whose silhouette is at least the baseline's.
    for i in range(0, len(subClusters)):
        if subClusters[i][4] >= totCluster[4]:
            powerClusters.append(subClusters[i])
            powerSil.append(subClusters[i][4])
    if(len(powerSil) == 0):
        # Nothing beat the baseline -- signal the caller to retry.
        return 'again'
    else:
        results.append('filtered average: ' + str(sum(powerSil)/len(powerSil)))
    
    # The baseline cluster always participates in the recommendations.
    powerClusters.append(totCluster)


    recommendationMatrix = r.buildRecommendations(names, powerClusters)
    results.append(recommendationMatrix)
    results.append(powerClusters)
    results.append(subClusters)
    return results
Ejemplo n.º 4
0
def createClusterHelpers(indexMap, subMatrix, aMap):
    """Run k-means (k=25, 8 iterations) on one sub-matrix and bundle results.

    Returns a list laid out as:
        [0] clusters          [1] centroids
        [2] cl.clusterMap     [3] indexMap
        [4] s.silhouettesList (per-item silhouettes, filled by the call below)
        [5] average silhouette score
    """
    cl.__init__(subMatrix, c.customers, aMap)
    kmeansOut = cl.kMeans(25, 8)
    clusters = kmeansOut[0]
    centroids = kmeansOut[1]
    # Computing the average also populates s.silhouettesList as a side effect,
    # so it must run before that list is read.
    avgSils = s.averageSilhouettes(clusters, subMatrix, centroids)
    return [
        clusters,
        centroids,
        cl.clusterMap,
        indexMap,
        s.silhouettesList,
        avgSils,
    ]
Ejemplo n.º 5
0
def dissolve(clusts, centroids, mats, maps, i):
    """Dissolve cluster i: re-cluster its members and splice the resulting
    sub-clusters (with matrices/maps rebuilt by st.redoMatrix) into the four
    parallel lists, which are mutated in place."""
    trans = mats[i].transpose()
    cl.__init__(trans, clusts[i], maps[i])
    # Roughly one sub-cluster per 8 members of the dissolved cluster.
    num = len(clusts[i])/8+1
    results = cl.kMeans(num, 20)

    pClusts = results[0]
    pCents = results[1]
    # Remove the dissolved cluster from every parallel list.
    clusts.pop(i)
    centroids.pop(i)
    mats.pop(i)
    maps.pop(i)

    # Append each sub-cluster; redoMatrix fills newMat/newMap in place
    # (out-parameters) for the cluster just appended.
    for j in range(0, len(pClusts)):
        clusts.append(pClusts[j])
        centroids.append(pCents[j])
        newMat = []
        newMap = {}
        st.redoMatrix(clusts,len(clusts)-1,newMat, newMap)
        mats.append(newMat)
        maps.append(newMap)
Ejemplo n.º 6
0
def dissolve(clusts, centroids, mats, maps, i):
    """Dissolve cluster i and replace it with its own sub-clusters.

    Re-clusters the transposed matrix of cluster ``i`` (roughly one
    sub-cluster per 8 members), removes entry ``i`` from the four parallel
    lists, then appends every sub-cluster along with a matrix/map rebuilt
    by ``st.redoMatrix``. All four lists are mutated in place.
    """
    subMatrix = mats[i].transpose()
    cl.__init__(subMatrix, clusts[i], maps[i])
    kmeansOut = cl.kMeans(len(clusts[i]) / 8 + 1, 20)
    newClusts = kmeansOut[0]
    newCents = kmeansOut[1]

    # Drop the dissolved cluster from every parallel list.
    for parallel in (clusts, centroids, mats, maps):
        parallel.pop(i)

    for clust, cent in zip(newClusts, newCents):
        clusts.append(clust)
        centroids.append(cent)
        rebuiltMat = []
        rebuiltMap = {}
        # redoMatrix fills rebuiltMat/rebuiltMap in place (out-parameters).
        st.redoMatrix(clusts, len(clusts) - 1, rebuiltMat, rebuiltMap)
        mats.append(rebuiltMat)
        maps.append(rebuiltMap)
Ejemplo n.º 7
0
def create_covariance_matrix(use_file, students, file, verbose=False):
    '''
    Build the covariance matrix for student data (from a file or from the
    students themselves), forcing it to be positive semidefinite.

    Reads the data from the file (if we need to fix how the data is read,
    change clustering init), preprocesses it with one-hot encoding (turns
    categorical variables into numerical ones), and, if the covariance
    matrix is not positive semidefinite, adds a tiny multiple of the
    identity to fix it.

    Parameters
    ----------
    use_file: indicates if we want to use the input from the file (bool).
    students: students to include in calculation (Student list).
    file: file to use (if use_file = True).
    verbose: print intermediate matrices for debugging (bool).

    Returns
    -------
    (data_array, one_hot_data_preprocessed, covariance_matrix, dict_key_vals)

    Raises
    ------
    DistanceError: if the covariance matrix is not square, or still is not
    positive semidefinite after the identity correction.
    '''
    if (use_file):
        data_array_tup = clustering.__init__(file)
    # Create covariance matrix from students themselves.
    else:
        multi_array = []
        for student in students:
            attributes = student.get_numerical_student_properties()
            multi_array.append(attributes)
        IDs = [s.ID for s in students]
        data_array = np.array(multi_array)
        if (verbose):
            print "Multi array is " + str(data_array)
        data_array_tup = (data_array, IDs)

    data_array = data_array_tup[0]
    # One-hot encode categorical columns so np.cov sees numeric data only.
    one_hot_data_preprocessed_tup = clustering.do_preprocessing(data_array_tup)

    one_hot_data_preprocessed = one_hot_data_preprocessed_tup[0]
    dict_key_vals = one_hot_data_preprocessed_tup[1]

    if (verbose):
        print "One hot data preprocessed is: "
        print one_hot_data_preprocessed
        print one_hot_data_preprocessed.shape

    # rowvar = 0 because each column represents a variable, while the rows are observations
    covariance_matrix = np.cov(one_hot_data_preprocessed, rowvar=0)
    if (verbose):
        print "Covariance matrix is:"
        print covariance_matrix
    shape = covariance_matrix.shape
    num_rows = shape[0]
    num_cols = shape[1]

    # Should never happen
    if (not (num_rows == num_cols)):
        raise DistanceError("Covariance matrix is not a square matrix.")

    else:
        if (is_positive_semidefinite(covariance_matrix)):
            if (verbose):
                print "Pos semi def on the first try!"
            pass
        # Our covariance matrix is not positive semidefinite -- an arithmetic error.
        # Will add (a small number * the identity matrix) to covariance matrix to fix this error.
        else:
            if (verbose):
                print "Not pos semi def on the first try!"
            n = num_rows
            i = np.array(np.identity(n))
            factor = 10.**-10
            # Create a matrix that is a small number times the identity.
            small_identity = np.multiply(i, factor)

            # Add that matrix to our covariance matrix (to make sure that our matrix is positive semidefinite.)
            result = np.add(small_identity, covariance_matrix)
            if (not (is_positive_semidefinite(result))):
                raise DistanceError(
                    "Fixed covariance matrix is not positive semidefinite.")
            else:
                covariance_matrix = result

    return (data_array, one_hot_data_preprocessed, covariance_matrix,
            dict_key_vals)
def create_covariance_matrix(use_file, students, file, verbose = False):
	'''
		Reads the data from the file (if we need to fix how the data is read, change clustering init.)
		Preprocesses data with one hot encoding (changes categorical variables into numerical.)
		Fixes matrix if it's not positive semidefinite (adds a small version of the identity.)
		Returns (data, covariance matrix.)

	    Parameters
	    ----------
	    use_file: indicates if we want to use the input from the file (bool).
	    students: students to include in calculation (Student list).
	    file: file to use (if use_file = True).

	    Returns
	    --------
	    covariance_matrix: the covariance matrix of the data (either from file or students).
	
	'''
	if (use_file):
		data_array_tup = clustering.__init__(file)
	# Create covariance matrix from students themselves.
	else:
		multi_array = []
		for student in students:
 			attributes = student.get_numerical_student_properties()
 			multi_array.append(attributes)
 		IDs = [s.ID for s in students]
 		data_array = np.array(multi_array)
 		if (verbose):
	 		print "Multi array is " + str(data_array)
 		data_array_tup = (data_array, IDs)

	data_array = data_array_tup[0]
	one_hot_data_preprocessed_tup = clustering.do_preprocessing(data_array_tup)

	one_hot_data_preprocessed = one_hot_data_preprocessed_tup[0]
	dict_key_vals = one_hot_data_preprocessed_tup[1]
	
	if (verbose):
		print "One hot data preprocessed is: "
		print one_hot_data_preprocessed
		print one_hot_data_preprocessed.shape

	# rowvar = 0 because each column represents a variable, while the rows are observations
	covariance_matrix = np.cov(one_hot_data_preprocessed, rowvar = 0)
	if (verbose):
		print "Covariance matrix is:"
		print covariance_matrix
	shape = covariance_matrix.shape
	num_rows = shape[0]
	num_cols = shape[1]
	
	# Should never happen
	if (not(num_rows == num_cols)):
		raise DistanceError("Covariance matrix is not a square matrix.")

	else:
		if (is_positive_semidefinite(covariance_matrix)):
			if (verbose):
				print "Pos semi def on the first try!"
			pass		
		# Our covariance matrix is not positive semidefinite -- an arithmetic error.
		# Will add (a small number * the identity matrix) to covariance matrix to fix this error.
		else:
			if (verbose):
				print "Not pos semi def on the first try!"
			n = num_rows
			i = np.array(np.identity(n))
			factor = 10. ** -10
			# Create a matrix that is a small number times the identity.
			small_identity = np.multiply(i, factor)

			# Add that matrix to our covariance matrix (to make sure that our matrix is positive semidefinite.)
			result = np.add(small_identity, covariance_matrix)
			if (not(is_positive_semidefinite(result))):
				raise DistanceError("Fixed covariance matrix is not positive semidefinite.")
			else:
				covariance_matrix = result

	return (data_array, one_hot_data_preprocessed, covariance_matrix, dict_key_vals)
Ejemplo n.º 9
0
def run(names):
    """Full pipeline: cluster products, build per-cluster customer helper
    bundles, split them into "power" clusters (silhouette >= the unfiltered
    baseline) and the rest, and build recommendations from the power set.

    Parameters
    ----------
    names: forwarded into the results list and to r.buildRecommendations.

    Returns
    -------
    'again' when no sub-cluster beats the baseline silhouette, otherwise a
    results list whose layout is given by the index comments below
    (index 11 is productClusterLocator).
    """
    global products
    products = p.products
    # indexes 0 and 1
    results = [names, c.customersMap]

    # Product clustering: k-means over the transposed customer matrix,
    # roughly one cluster per 8 products.
    global transpose
    transpose = c.matrix.transpose()
    cl.__init__(transpose, p.products)
    catNum = len(p.products)/8 + 1
    outputs = cl.kMeans(catNum,8)
    productClusters = outputs[0]
    centroids = outputs[1]

    # Normalize the product clusters using the per-cluster sub-matrices.
    inputs = st.subMatrices(productClusters)
    productClusters = n.normalizeProdClusters(productClusters, centroids, inputs[0], inputs[1], 0.2, 0.4)
    # index 2
    results.append(productClusters)
    # index 3
    results.append(p.productsMap)
    # index 4
    results.append(products)

    # Recompute sub-matrices from the normalized clusters.
    inputs = st.subMatrices(productClusters)
    subMats = inputs[0]
    maps = inputs[1]
    indexMap = inputs[2]

    # One helper bundle (plus its recommendations) per product cluster.
    subClustersHelpers = []
    for i in range(0, len(subMats)):
        subCluster = st.createSubclustersHelpers(indexMap[i], subMats[i], maps[i])
        subCluster.append(r.buildRecommendations(names, [subCluster]))
        subClustersHelpers.append(subCluster)


    # Baseline: one helper bundle over the full matrix; index 5 of the
    # helper bundle is the average silhouette score.
    customerClustersHelpers = st.createSubclustersHelpers(p.products, c.matrix, p.productsMap)
    customerClustersHelpers.append(r.buildRecommendations(names,[customerClustersHelpers]))
    powerClustersHelpers = []
    powerI = []
    powerCount = 0
    # productClusterLocator maps cluster index i -> ('power'|'sub', position)
    # so callers can find each cluster after the split below.
    productClusterLocator = []
    for i in range(0, len(subClustersHelpers)):
        if subClustersHelpers[i][5] >= customerClustersHelpers[5]:
            powerClustersHelpers.append(subClustersHelpers[i])
            powerI.append(i)
            productClusterLocator.append(['power', powerCount])
            powerCount += 1
        else:
            productClusterLocator.append(['sub', i - powerCount])
    if(len(powerClustersHelpers) == 0):
        # Nothing beat the baseline -- signal the caller to retry.
        return 'again'
    # Remove the promoted clusters from subClustersHelpers; displacement
    # compensates for indices shifting left after each pop.
    displacement = 0
    for i in range(0,len(powerI)):
        subClustersHelpers.pop(powerI[i]-displacement)
        displacement += 1

    powerRecMatrix = r.buildRecommendations(names, powerClustersHelpers)
    # index 5
    results.append(powerRecMatrix)
    # index 6
    results.append([customerClustersHelpers])
    # index 7
    results.append(subClustersHelpers)
    # index 8
    results.append(powerClustersHelpers)
    # index 9
    results.append(c.matrix)
    # index 10
    productClustersMap = st.createClusterMap(productClusters)
    results.append(productClustersMap)
    results.append(productClusterLocator)
    
    return results