Ejemplo n.º 1
0
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
    '''
    Assign every data point to its cheapest medoid and return the summed
    per-cluster average cost together with the clustering.

    Parameters:
        data: indexable collection of points; ``data[k]`` is handed to the
            selected distance function.
        costF_idx: distance selector -- 0 euclidean_distance,
            1 manhattan_distance, 2 pearson_distance.
        medoids_idx: iterable of indices into ``data`` that act as medoids.
        cacheOn: when true, distances are read from and written to the
            module-level ``distances_cache`` dict under ``(medoid, point)``.

    Returns:
        ``(avg_cost, medoids)``.  ``medoids`` maps each medoid index to the
        list of point indices assigned to it; ``avg_cost`` is the sum over
        ``medoids_idx`` of (cluster cost / cluster size).  A medoid that
        ends up with no members contributes 0 instead of dividing by zero.

    Raises:
        ValueError: a distance has to be computed and ``costF_idx`` is not
            one of the known selectors.
    '''
    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # A cache miss (or a disabled cache) leaves tmp as None.
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                else:
                    # Fail loudly: continuing would compare an undefined or
                    # stale distance.
                    raise ValueError(
                        'Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Strict '<' means the first medoid visited wins ties.
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        medoids[choice].append(i)
        total_cost[choice] += min_cost

    # Compute the average cost
    avg_cost = 0.0
    for idx in medoids_idx:
        members = medoids[idx]
        if members:  # an empty cluster has nothing to average
            avg_cost += total_cost[idx] / len(members)

    # Return the average cost and clustering
    return (avg_cost, medoids)
Ejemplo n.º 2
0
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
    '''
    Cluster the points around the given medoids and return the sum of the
    clusters' average costs along with the clustering itself.

    Parameters:
        data: indexable collection of points; ``data[k]`` is passed to the
            selected distance function.
        costF_idx: distance selector -- 0 euclidean_distance,
            1 manhattan_distance, 2 pearson_distance.
        medoids_idx: iterable of indices into ``data`` used as medoids.
        cacheOn: when true, the module-level ``distances_cache`` dict is
            consulted and filled, keyed by ``(medoid, point)``.

    Returns:
        ``(avg_cost, medoids)`` where ``medoids`` maps medoid index to the
        list of assigned point indices and ``avg_cost`` adds up
        (cluster cost / cluster size) for every entry of ``medoids_idx``.
        Clusters without members are skipped rather than dividing by zero.

    Raises:
        ValueError: ``costF_idx`` is not a known selector and a distance
            needs to be computed.
    '''
    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # None signals "not cached" as well as "cache disabled".
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                else:
                    # Without a distance the comparison below would use an
                    # undefined or stale value, so stop here.
                    raise ValueError(
                        'Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Ties keep the medoid that was visited first.
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        medoids[choice].append(i)
        total_cost[choice] += min_cost

    # Compute the average cost
    avg_cost = 0.0
    for idx in medoids_idx:
        members = medoids[idx]
        if members:  # skip empty clusters to avoid ZeroDivisionError
            avg_cost += total_cost[idx] / len(members)

    # Return the average cost and clustering
    return (avg_cost, medoids)
Ejemplo n.º 3
0
def targetFunction(data, costF_idx, medoids_idx, cacheOn=False, distDict={},
                   simDict={}, affinities={}, costType=CostType,
                   namedPoints=True):
    '''
    Cluster the points around the given medoids and evaluate the selected
    target function on the resulting clustering.

    Parameters:
        data: indexable collection of points; ``data[k]`` is passed to the
            selected distance function.
        costF_idx: distance selector -- 0 euclidean_distance,
            1 manhattan_distance, 2 pearson_distance,
            3 direct_distance (also receives ``distDict``),
            4 similarity_distance (also receives ``simDict``).
        medoids_idx: collection of indices into ``data`` used as medoids.
        cacheOn: when true, the module-level ``distances_cache`` dict is
            consulted and filled, keyed by ``(medoid, point)``.
        distDict, simDict: forwarded to the distance helpers; ``distDict``
            is also forwarded to ``modularity``.
        affinities: forwarded to ``modularity`` as ``edgeDict``.
        costType: one of "total", "average" or "modularity".
        namedPoints: with ``costType == "modularity"``, when true the
            clusters are re-keyed by ``data[...]`` values before being
            handed to ``modularity``.

    Returns:
        ``(value, medoids)`` where ``medoids`` maps each medoid index to the
        list of assigned point indices and ``value`` is
        - "total": the sum of all cluster costs;
        - "average": the sum of (cluster cost / cluster size), empty
          clusters contributing 0;
        - "modularity": the negated result of ``modularity`` (negated
          because modularity is maximised).
        For an unknown ``costType`` a message is printed and the bare
        integer 1 is returned.

    Raises:
        ValueError: a distance has to be computed and ``costF_idx`` is not
            one of the known selectors.

    NOTE: the dict defaults are shared between calls; this function itself
    only reads or forwards them.  The single-argument ``print(...)`` form
    used below produces the same output under the Python 2 print statement
    and the Python 3 print function.
    '''
    if costType not in ["total", "average", "modularity"]:
        print("unknown target function - check the global variables in the code")
        return 1

    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            # None signals "not cached" as well as "cache disabled".
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    tmp = similarity_distance(data[m], data[i], simDict)
                else:
                    # Continuing would compare (and cache) an undefined or
                    # stale distance, so stop here.
                    raise ValueError(
                        'Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:  # no connection to either medoid
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.:
                    choice = m
            elif tmp < min_cost:
                choice = m
                min_cost = tmp

        if choice == -1:
            # No medoid beat the initial infinite cost.
            print("ERROR: the node cannot be assigned")
            assignErrors.append(i)
        else:
            medoids[choice].append(i)
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        return (sum(total_cost.values()), medoids)

    if costType == "average":
        avg_cost = 0.0
        for idx in medoids_idx:
            members = medoids[idx]
            if members:  # clusters can be empty; avoid ZeroDivisionError
                avg_cost += total_cost[idx] / len(members)
        return (avg_cost, medoids)

    # Only "modularity" is left: costType was validated on entry.
    if namedPoints:
        # Re-key the clustering by the data values themselves.
        named_medoids = {}
        for medID in medoids_idx:
            named_medoids[data[medID]] = [data[pointID]
                                          for pointID in medoids[medID]]
        clusters = named_medoids
    else:
        clusters = medoids
    # "-" because we maximize modularity
    mod = -modularity(data, COST=costF_idx, distDict=distDict,
                      edgeDict=affinities, medoids=clusters)
    print("modularity computed")

    if len(assignErrors) > 0:
        # Two spaces after the colon reproduce the output of the original
        # two-item Python 2 print statement.
        print("unassigned nodes:  %s" % (assignErrors,))
    else:
        print("no unassigned nodes, all right")

    return (mod, medoids)
Ejemplo n.º 4
0
def targetFunction(data,
                   costF_idx,
                   medoids_idx,
                   cacheOn=False,
                   distDict={},
                   simDict={},
                   affinities={},
                   costType=CostType,
                   namedPoints=True):
    '''
    Assign each point to a medoid and compute the requested target function
    for that clustering.

    Parameters:
        data: indexable collection of points; ``data[k]`` is passed to the
            selected distance function.
        costF_idx: distance selector -- 0 euclidean_distance,
            1 manhattan_distance, 2 pearson_distance,
            3 direct_distance (also receives ``distDict``),
            4 similarity_distance (also receives ``simDict``).
        medoids_idx: collection of indices into ``data`` used as medoids.
        cacheOn: when true, distances are read from and written to the
            module-level ``distances_cache`` dict under ``(medoid, point)``.
        distDict, simDict: forwarded to the distance helpers; ``distDict``
            is also forwarded to ``modularity``.
        affinities: forwarded to ``modularity`` as ``edgeDict``.
        costType: one of "total", "average" or "modularity".
        namedPoints: with ``costType == "modularity"``, when true the
            clusters are re-keyed by ``data[...]`` values before being
            handed to ``modularity``.

    Returns:
        ``(value, medoids)`` where ``medoids`` maps each medoid index to the
        list of assigned point indices and ``value`` is
        - "total": the sum of all cluster costs;
        - "average": the sum of (cluster cost / cluster size), with empty
          clusters contributing 0;
        - "modularity": the negated result of ``modularity`` (negated
          because modularity is maximised).
        For an unknown ``costType`` a message is printed and the bare
        integer 1 is returned.

    Raises:
        ValueError: ``costF_idx`` is not a known selector and a distance
            needs to be computed.

    NOTE: the dict defaults are shared between calls; this function itself
    only reads or forwards them.  Every ``print(...)`` below takes a single
    argument so its output is the same under the Python 2 print statement
    and the Python 3 print function.
    '''
    if costType not in ["total", "average", "modularity"]:
        print("unknown target function - check the global variables in the code")
        return 1

    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            # A cache miss (or a disabled cache) leaves tmp as None.
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    tmp = similarity_distance(data[m], data[i], simDict)
                else:
                    # Fail loudly instead of comparing (and caching) an
                    # undefined or stale distance.
                    raise ValueError(
                        'Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:  # no connection to either medoid
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.:
                    choice = m
            elif tmp < min_cost:
                choice = m
                min_cost = tmp

        if choice == -1:
            # No medoid beat the initial infinite cost.
            print("ERROR: the node cannot be assigned")
            assignErrors.append(i)
        else:
            medoids[choice].append(i)
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        return (sum(total_cost.values()), medoids)

    if costType == "average":
        avg_cost = 0.0
        for idx in medoids_idx:
            members = medoids[idx]
            if members:  # an empty cluster has nothing to average
                avg_cost += total_cost[idx] / len(members)
        return (avg_cost, medoids)

    # Only "modularity" is left: costType was validated on entry.
    if namedPoints:
        # Re-key the clustering by the data values themselves.
        named_medoids = {}
        for medID in medoids_idx:
            named_medoids[data[medID]] = [data[pointID]
                                          for pointID in medoids[medID]]
        clusters = named_medoids
    else:
        clusters = medoids
    # "-" because we maximize modularity
    mod = -modularity(data,
                      COST=costF_idx,
                      distDict=distDict,
                      edgeDict=affinities,
                      medoids=clusters)
    print("modularity computed")

    if len(assignErrors) > 0:
        # Two spaces after the colon reproduce the output of the original
        # two-item Python 2 print statement.
        print("unassigned nodes:  %s" % (assignErrors,))
    else:
        print("no unassigned nodes, all right")

    return (mod, medoids)
Ejemplo n.º 5
0
def totalCost(data,
              costF_idx,
              medoids_idx,
              cacheOn=CacheOn,
              distDict={},
              simDict={},
              acceleration=0):
    '''
    Compute the total cost and do the clustering based on certain cost
    function (that is, assign each data point to certain cluster given the
    medoids).

    Parameters:
        data: indexable collection of points; ``data[k]`` is passed to the
            selected distance function.
        costF_idx: distance selector -- 0 euclidean_distance,
            1 manhattan_distance, 2 pearson_distance,
            3 direct_distance (also receives ``distDict``),
            4 similarity_distance (also receives ``simDict``).
        medoids_idx: iterable of indices into ``data`` used as medoids.
        cacheOn: when true, distances are read from and written to the
            module-level ``distances_cache`` dict under ``(medoid, point)``.
        distDict, simDict: forwarded to direct_distance and
            similarity_distance respectively.
        acceleration: when equal to 2 the clustering is returned in the
            transformed layout described below.

    Returns:
        ``(total_cost, clustering)``.  ``total_cost`` is the sum over all
        points of the cost to their cheapest medoid.  ``clustering`` maps
        each medoid index to the list of point indices assigned to it;
        points whose minimum cost is exactly 0 are counted in the total but
        left out of every cluster.  With ``acceleration == 2`` the
        clustering is instead ``{str(position): {'med': medoid_index,
        'nodes': [point indices]}}``.

    Raises:
        ValueError: a distance has to be computed and ``costF_idx`` is not
            one of the known selectors.
        Any exception raised by similarity_distance is re-raised after the
        offending pair has been printed.

    NOTE: the dict defaults are shared between calls; this function itself
    only forwards them.
    '''
    # Init the cluster
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # A cache miss (or a disabled cache) leaves tmp as None.
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    try:
                        tmp = similarity_distance(data[m], data[i], simDict)
                    except Exception:
                        # Show the offending pair, then propagate: carrying
                        # on would reuse a stale distance and cache it.
                        print("%s %s" % (m, i))
                        print(data[m])
                        print(data[i])
                        raise
                else:
                    raise ValueError(
                        'Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Strict '<' means the first medoid visited wins ties.
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # A minimum cost of exactly 0 ("0 similarity to all the medoids")
        # leaves the point out of every cluster.
        if min_cost != 0:
            medoids[choice].append(i)
        total_cost += min_cost

    if acceleration == 2:
        # Re-key the clusters by their position (as a string) and keep the
        # medoid index alongside its members.
        transformed_medoids = {}
        for pos, m in enumerate(medoids):
            transformed_medoids[str(pos)] = {'med': m, 'nodes': medoids[m]}
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)
Ejemplo n.º 6
0
def totalCost(data, costF_idx, medoids_idx, cacheOn=CacheOn, distDict={}, simDict={}, acceleration=0):
    '''
    Compute the total cost and do the clustering based on certain cost
    function (that is, assign each data point to certain cluster given the
    medoids).

    Parameters:
        data: indexable collection of points; ``data[k]`` is passed to the
            selected distance function.
        costF_idx: distance selector -- 0 euclidean_distance,
            1 manhattan_distance, 2 pearson_distance,
            3 direct_distance (also receives ``distDict``),
            4 similarity_distance (also receives ``simDict``).
        medoids_idx: iterable of indices into ``data`` used as medoids.
        cacheOn: when true, the module-level ``distances_cache`` dict is
            consulted and filled, keyed by ``(medoid, point)``.
        distDict, simDict: forwarded to direct_distance and
            similarity_distance respectively.
        acceleration: when equal to 2 the clustering is returned in the
            transformed layout described below.

    Returns:
        ``(total_cost, clustering)``.  ``total_cost`` adds up, for every
        point, the cost to its cheapest medoid.  ``clustering`` maps each
        medoid index to the list of point indices assigned to it; a point
        whose minimum cost is exactly 0 still counts towards the total but
        is not placed in any cluster.  With ``acceleration == 2`` the
        clustering is instead ``{str(position): {'med': medoid_index,
        'nodes': [point indices]}}``.

    Raises:
        ValueError: ``costF_idx`` is not a known selector and a distance
            needs to be computed.
        Any exception raised by similarity_distance is re-raised after the
        offending pair has been printed.

    NOTE: the dict defaults are shared between calls; this function itself
    only forwards them.
    '''
    # Init the cluster
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # None signals "not cached" as well as "cache disabled".
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    try:
                        tmp = similarity_distance(data[m], data[i], simDict)
                    except Exception:
                        # Print the failing pair for diagnosis, then let the
                        # error surface rather than reusing a stale distance.
                        print("%s %s" % (m, i))
                        print(data[m])
                        print(data[i])
                        raise
                else:
                    raise ValueError(
                        'Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Ties keep the medoid that was visited first.
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # A minimum cost of exactly 0 ("0 similarity to all the medoids")
        # leaves the point out of every cluster.
        if min_cost != 0:
            medoids[choice].append(i)
        total_cost += min_cost

    if acceleration == 2:
        # Re-key the clusters by their position (as a string) and keep the
        # medoid index alongside its members.
        transformed_medoids = {}
        for pos, m in enumerate(medoids):
            transformed_medoids[str(pos)] = {'med': m, 'nodes': medoids[m]}
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)