def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
    '''
    Assign every point of `data` to its cheapest medoid and return the sum
    of the per-cluster average costs together with the clustering.

    Parameters:
        data        -- indexable collection of points; medoids and points
                       are both addressed by their position in it
        costF_idx   -- cost function selector: 0 = euclidean_distance,
                       1 = manhattan_distance, 2 = pearson_distance
        medoids_idx -- sequence of indices into `data` used as medoids
        cacheOn     -- when true, distances are looked up in (and stored
                       to) the module-level `distances_cache`, keyed by
                       (medoid index, point index)

    Returns:
        (avg_cost, medoids): `medoids` maps each medoid index to the list
        of point indices assigned to it; `avg_cost` is the sum over
        `medoids_idx` of (cluster cost / cluster size). Clusters that end
        up empty contribute nothing instead of dividing by zero.

    Raises:
        ValueError -- if a distance has to be computed and `costF_idx` is
                      not one of the supported selectors
    '''
    # Init the clusters
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0

    # Compute the distances and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # A cache miss is signalled by None, exactly like "cache off"
            tmp = distances_cache.get((m, i), None) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                else:
                    # Without a valid selector there is no distance to work
                    # with, so fail loudly instead of continuing
                    raise ValueError(
                        'Error: unknown cost function idx: %r' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Strict comparison: on ties the medoid visited first wins
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering for this point
        medoids[choice].append(i)
        total_cost[choice] += min_cost

    # Compute the average cost; an empty cluster has no average, skip it
    avg_cost = 0.0
    for idx in medoids_idx:
        members = medoids[idx]
        if members:
            avg_cost += total_cost[idx] / len(members)

    # Return the average cost and clustering
    return (avg_cost, medoids)
def targetFunction(data, costF_idx, medoids_idx, cacheOn=False, distDict={}, simDict={}, affinities={}, costType=CostType, namedPoints=True):
    '''
    Cluster `data` around the given medoids and evaluate the clustering
    with the target function selected by `costType`.

    Parameters:
        data        -- indexable collection of points; medoids and points
                       are both addressed by their position in it
        costF_idx   -- cost function selector: 0 = euclidean_distance,
                       1 = manhattan_distance, 2 = pearson_distance,
                       3 = direct_distance (uses `distDict`),
                       4 = similarity_distance (uses `simDict`)
        medoids_idx -- sequence of indices into `data` used as medoids
        cacheOn     -- when true, distances are looked up in (and stored
                       to) the module-level `distances_cache`, keyed by
                       (medoid index, point index)
        distDict    -- passed to direct_distance and to modularity
        simDict     -- passed to similarity_distance
        affinities  -- passed to modularity as `edgeDict`
        costType    -- "total", "average" or "modularity"
        namedPoints -- modularity only: when true the clusters handed to
                       modularity hold `data[...]` entries instead of
                       indices

    The dict defaults are shared between calls; this function itself only
    forwards them and never modifies them.

    Returns:
        1 if `costType` is not recognised (a message is printed), otherwise
        a tuple (value, medoids) where `medoids` maps each medoid index to
        the list of point indices assigned to it and `value` is
          "total"      -- the sum of all cluster costs
          "average"    -- the sum of (cluster cost / cluster size); empty
                          clusters contribute nothing
          "modularity" -- minus the modularity of the clustering

    Raises:
        ValueError -- if a distance has to be computed and `costF_idx` is
                      not one of the supported selectors
    '''
    if costType not in ["total", "average", "modularity"]:
        print("unknown target function - check the global variables in the code")
        return 1

    # Init the clusters
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distances and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            # A cache miss is signalled by None, exactly like "cache off"
            tmp = distances_cache.get((m, i), None) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    tmp = similarity_distance(data[m], data[i], simDict)
                else:
                    # Without a valid selector there is no distance to work
                    # with, so fail loudly instead of continuing
                    raise ValueError(
                        'Error: unknown cost function idx: %r' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:
                # Tie at zero with the current best: switch to this medoid
                # with probability 1 / (number of medoids)
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.:
                    choice = m
            elif tmp < min_cost:
                choice = m
                min_cost = tmp

        # Done the clustering for this point
        if choice == -1:
            print("ERROR: the node cannot be assigned")
            assignErrors.append(i)
        else:
            medoids[choice].append(i)
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        return (sum(total_cost.values()), medoids)

    if costType == "average":
        # Random tie-breaking and unassignable points can leave a cluster
        # empty; it has no average, so skip it rather than divide by zero
        avg_cost = 0.0
        for idx in medoids_idx:
            members = medoids[idx]
            if members:
                avg_cost += total_cost[idx] / len(members)
        return (avg_cost, medoids)

    # Only "modularity" is left, costType was validated on entry
    if namedPoints:
        # Re-key the clusters by the points themselves instead of indices
        named_medoids = {}
        for medID in medoids_idx:
            named_medoids[data[medID]] = [data[pointID] for pointID in medoids[medID]]
        clusters = named_medoids
    else:
        clusters = medoids
    # "-" because we maximize modularity
    mod = -modularity(data, COST=costF_idx, distDict=distDict, edgeDict=affinities, medoids=clusters)
    print("modularity computed")

    # NOTE: this summary is only reached on the modularity path; the
    # "total" and "average" paths return before it.
    if assignErrors:
        # Single pre-formatted argument (two spaces) reproduces the output
        # of the former `print "unassigned nodes: ", assignErrors`
        print("unassigned nodes:  %s" % (assignErrors,))
    else:
        print("no unassigned nodes, all right")
    return (mod, medoids)
def targetFunction(data, costF_idx, medoids_idx, cacheOn=False, distDict={}, simDict={}, affinities={}, costType=CostType, namedPoints=True):
    '''
    Cluster `data` around the given medoids and evaluate the clustering
    with the target function selected by `costType`.

    Parameters:
        data        -- indexable collection of points; medoids and points
                       are both addressed by their position in it
        costF_idx   -- cost function selector: 0 = euclidean_distance,
                       1 = manhattan_distance, 2 = pearson_distance,
                       3 = direct_distance (uses `distDict`),
                       4 = similarity_distance (uses `simDict`)
        medoids_idx -- sequence of indices into `data` used as medoids
        cacheOn     -- when true, distances are looked up in (and stored
                       to) the module-level `distances_cache`, keyed by
                       (medoid index, point index)
        distDict    -- passed to direct_distance and to modularity
        simDict     -- passed to similarity_distance
        affinities  -- passed to modularity as `edgeDict`
        costType    -- "total", "average" or "modularity"
        namedPoints -- modularity only: when true the clusters handed to
                       modularity hold `data[...]` entries instead of
                       indices

    The dict defaults are shared between calls; this function itself only
    forwards them and never modifies them.

    Returns:
        1 if `costType` is not recognised (a message is printed), otherwise
        a tuple (value, medoids) where `medoids` maps each medoid index to
        the list of point indices assigned to it and `value` is
          "total"      -- the sum of all cluster costs
          "average"    -- the sum of (cluster cost / cluster size); empty
                          clusters contribute nothing
          "modularity" -- minus the modularity of the clustering

    Raises:
        ValueError -- if a distance has to be computed and `costF_idx` is
                      not one of the supported selectors
    '''
    if costType not in ["total", "average", "modularity"]:
        print("unknown target function - check the global variables in the code")
        return 1

    # Init the clusters
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distances and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            # A cache miss is signalled by None, exactly like "cache off"
            tmp = distances_cache.get((m, i), None) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    tmp = similarity_distance(data[m], data[i], simDict)
                else:
                    # Without a valid selector there is no distance to work
                    # with, so fail loudly instead of continuing
                    raise ValueError(
                        'Error: unknown cost function idx: %r' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:
                # Tie at zero with the current best: switch to this medoid
                # with probability 1 / (number of medoids)
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.:
                    choice = m
            elif tmp < min_cost:
                choice = m
                min_cost = tmp

        # Done the clustering for this point
        if choice == -1:
            print("ERROR: the node cannot be assigned")
            assignErrors.append(i)
        else:
            medoids[choice].append(i)
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        return (sum(total_cost.values()), medoids)

    if costType == "average":
        # Random tie-breaking and unassignable points can leave a cluster
        # empty; it has no average, so skip it rather than divide by zero
        avg_cost = 0.0
        for idx in medoids_idx:
            members = medoids[idx]
            if members:
                avg_cost += total_cost[idx] / len(members)
        return (avg_cost, medoids)

    # Only "modularity" is left, costType was validated on entry
    if namedPoints:
        # Re-key the clusters by the points themselves instead of indices
        named_medoids = {}
        for medID in medoids_idx:
            named_medoids[data[medID]] = [data[pointID] for pointID in medoids[medID]]
        clusters = named_medoids
    else:
        clusters = medoids
    # "-" because we maximize modularity
    mod = -modularity(data, COST=costF_idx, distDict=distDict, edgeDict=affinities, medoids=clusters)
    print("modularity computed")

    # NOTE: this summary is only reached on the modularity path; the
    # "total" and "average" paths return before it.
    if assignErrors:
        # Single pre-formatted argument (two spaces) reproduces the output
        # of the former `print "unassigned nodes: ", assignErrors`
        print("unassigned nodes:  %s" % (assignErrors,))
    else:
        print("no unassigned nodes, all right")
    return (mod, medoids)
def totalCost(data, costF_idx, medoids_idx, cacheOn=CacheOn, distDict={}, simDict={}, acceleration=0):
    '''
    Assign each data point to its cheapest medoid and return the total
    cost of that assignment together with the clustering.

    Parameters:
        data         -- indexable collection of points; medoids and points
                        are both addressed by their position in it
        costF_idx    -- cost function selector: 0 = euclidean_distance,
                        1 = manhattan_distance, 2 = pearson_distance,
                        3 = direct_distance (uses `distDict`),
                        4 = similarity_distance (uses `simDict`)
        medoids_idx  -- iterable of indices into `data` used as medoids
        cacheOn      -- when true, distances are looked up in (and stored
                        to) the module-level `distances_cache`, keyed by
                        (medoid index, point index)
        distDict     -- passed to direct_distance
        simDict      -- passed to similarity_distance
        acceleration -- when equal to 2 the clustering is returned in the
                        "transformed" layout described below

    The dict defaults are shared between calls; this function itself only
    forwards them and never modifies them.

    Returns:
        (total_cost, clustering). `total_cost` is the sum of every point's
        minimal cost. `clustering` maps each medoid index to the list of
        point indices assigned to it, or, when `acceleration == 2`, maps
        str(position) to {'med': medoid index, 'nodes': [point indices]}.
        A point whose minimal cost is 0 is not placed in any cluster.

    Raises:
        ValueError -- if a distance has to be computed and `costF_idx` is
                      not one of the supported selectors
        Any exception raised by similarity_distance is re-raised after the
        offending pair has been printed.
    '''
    # Init the clusters
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    # Compute the distances and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # A cache miss is signalled by None, exactly like "cache off"
            tmp = distances_cache.get((m, i), None) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    try:
                        tmp = similarity_distance(data[m], data[i], simDict)
                    except Exception:
                        # Show which pair failed, then propagate: carrying
                        # on would reuse a stale distance and cache it
                        print("%s %s" % (m, i))
                        print(data[m])
                        print(data[i])
                        raise
                else:
                    # Without a valid selector there is no distance to work
                    # with, so fail loudly instead of continuing
                    raise ValueError(
                        'Error: unknown cost function idx: %r' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Strict comparison: on ties the medoid visited first wins
            if tmp < min_cost:
                choice = m
                min_cost = tmp

        # Done the clustering for this point. A minimal cost of 0 is
        # treated as "0 similarity to all the medoids": the point is left
        # out of every cluster (and adds nothing to the total).
        if min_cost != 0:
            medoids[choice].append(i)
        total_cost += min_cost

    if acceleration == 2:
        # Re-key the clusters by their position, keeping the medoid index
        # alongside its members
        transformed_medoids = {}
        for pos, m in enumerate(medoids):
            transformed_medoids[str(pos)] = {'med': m, 'nodes': medoids[m]}
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)
def totalCost(data, costF_idx, medoids_idx, cacheOn=CacheOn, distDict={}, simDict={}, acceleration=0):
    '''
    Assign each data point to its cheapest medoid and return the total
    cost of that assignment together with the clustering.

    Parameters:
        data         -- indexable collection of points; medoids and points
                        are both addressed by their position in it
        costF_idx    -- cost function selector: 0 = euclidean_distance,
                        1 = manhattan_distance, 2 = pearson_distance,
                        3 = direct_distance (uses `distDict`),
                        4 = similarity_distance (uses `simDict`)
        medoids_idx  -- iterable of indices into `data` used as medoids
        cacheOn      -- when true, distances are looked up in (and stored
                        to) the module-level `distances_cache`, keyed by
                        (medoid index, point index)
        distDict     -- passed to direct_distance
        simDict      -- passed to similarity_distance
        acceleration -- when equal to 2 the clustering is returned in the
                        "transformed" layout described below

    The dict defaults are shared between calls; this function itself only
    forwards them and never modifies them.

    Returns:
        (total_cost, clustering). `total_cost` is the sum of every point's
        minimal cost. `clustering` maps each medoid index to the list of
        point indices assigned to it, or, when `acceleration == 2`, maps
        str(position) to {'med': medoid index, 'nodes': [point indices]}.
        A point whose minimal cost is 0 is not placed in any cluster.

    Raises:
        ValueError -- if a distance has to be computed and `costF_idx` is
                      not one of the supported selectors
        Any exception raised by similarity_distance is re-raised after the
        offending pair has been printed.
    '''
    # Init the clusters
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    # Compute the distances and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # A cache miss is signalled by None, exactly like "cache off"
            tmp = distances_cache.get((m, i), None) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    try:
                        tmp = similarity_distance(data[m], data[i], simDict)
                    except Exception:
                        # Show which pair failed, then propagate: carrying
                        # on would reuse a stale distance and cache it
                        print("%s %s" % (m, i))
                        print(data[m])
                        print(data[i])
                        raise
                else:
                    # Without a valid selector there is no distance to work
                    # with, so fail loudly instead of continuing
                    raise ValueError(
                        'Error: unknown cost function idx: %r' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Strict comparison: on ties the medoid visited first wins
            if tmp < min_cost:
                choice = m
                min_cost = tmp

        # Done the clustering for this point. A minimal cost of 0 is
        # treated as "0 similarity to all the medoids": the point is left
        # out of every cluster (and adds nothing to the total).
        if min_cost != 0:
            medoids[choice].append(i)
        total_cost += min_cost

    if acceleration == 2:
        # Re-key the clusters by their position, keeping the medoid index
        # alongside its members
        transformed_medoids = {}
        for pos, m in enumerate(medoids):
            transformed_medoids[str(pos)] = {'med': m, 'nodes': medoids[m]}
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)