Ejemplo n.º 1
 def calculate_distance(self):
     """Return the total route length and the accumulated duration excess.

     Walks every route of every depot in ``self.routes``. A route's length is
     the sum of ``euclidean_distance`` over consecutive stops, starting at the
     depot coordinate and then visiting the route's customers in order.
     Lengths are cached in the class-level ``Chromosome.route_memo`` under
     ``hash((depot_id, tuple(route)))``.

     Returns:
         A ``(distance, excess_duration)`` tuple: the summed length of all
         routes, and the summed amount by which routes exceed the second
         entry of their depot record (a value of 0 disables the check).
     """
     distance = 0
     excess_duration = 0
     for depot_id, routes in self.routes.items():
         depot_coordinate = self.depots[depot_id][0]
         max_duration = self.depots[depot_id][1]
         for route in routes:
             key = hash((depot_id, tuple(route)))
             if key in Chromosome.route_memo:
                 route_distance = Chromosome.route_memo[key]
             else:
                 # Cache miss: measure the trip once and remember it. Without
                 # this branch the memo would never be filled and a cached
                 # route would be counted twice.
                 trip = [depot_coordinate]
                 trip.extend(self.customers[stop][0] for stop in route)
                 route_distance = 0
                 for origin, target in zip(trip, trip[1:]):
                     route_distance += euclidean_distance(origin, target)
                 Chromosome.route_memo[key] = route_distance
             distance += route_distance
             if max_duration != 0 and route_distance > max_duration:
                 excess_duration += route_distance - max_duration
     return distance, excess_duration
    def get_nearest_neighbhours(self, test_instance, training_data=None, k_neighbours=None):
        """
        Computes the Euclidean distances for the test instance with all training data instances, and returns
        the first K instances.
        :param test_instance: a 1*n feature vector to be the main operand in the Euclidean distance calculations
        :param training_data: the feature vectors with their classes to have their distances compared;
            falls back to ``self.training_data`` when None
        :param k_neighbours: how many neighbours to return; defaults to 51 when None
        :return: a list of ``{"item": ..., "distance": ...}`` dicts for the closest K training items,
            nearest first
        :raises ValueError: if neither the argument nor ``self.training_data`` provides training data
        """
        if k_neighbours is None:
            k_neighbours = 51

        if training_data is None:
            if self.training_data is None:
                raise ValueError("KNN Model has not been given with any training data to compute neighbours")
            # Fall back to the data stored on the model; this assignment must
            # sit outside the ``raise`` branch to ever take effect.
            training_data = self.training_data

        items_with_distances = [
            {"item": item, "distance": util.euclidean_distance(item[0], test_instance)}
            for item in training_data
        ]
        # Order the entries by the value at their 'distance' key.
        items_with_distances.sort(key=lambda entry: entry["distance"])
        return items_with_distances[:k_neighbours]
Ejemplo n.º 3
    def cluster(self, vectors, assign_clusters=False,ClusterNum=None, DisType='euc',Stype='mean',trace=False):
        """Cluster ``vectors`` after rebuilding the pairwise distance map.

        The distance of every index pair ``(i, j)`` with ``i < j`` is stored
        in ``self._distMap``; the clustering itself is delegated to
        ``VectorSpaceClusterer.cluster``. For two-dimensional samples the
        result is additionally passed to ``draw_2D_cluster``.

        :param vectors: the sample vectors to cluster
        :param assign_clusters: forwarded to ``VectorSpaceClusterer.cluster``
        :param ClusterNum: forwarded to ``VectorSpaceClusterer.cluster``
        :param DisType: ``'cos'`` selects ``cosine_distance``, ``'euc'`` selects
            ``euclidean_distance``; any other value leaves the map empty
        :param Stype: forwarded to ``VectorSpaceClusterer.cluster``
        :param trace: forwarded to ``VectorSpaceClusterer.cluster``
        :return: ``[]`` for empty input, otherwise the delegate's result
        """
        # The map must be refreshed before clustering a different sample set.
        self._distMap.clear()

        l = len(vectors)
        if 0 == l:
            return []

        if 'cos' == DisType:
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = cosine_distance(vectors[i], vectors[j])
        elif 'euc' == DisType:
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = euclidean_distance(vectors[i], vectors[j])
        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters, ClusterNum, Stype, trace)

        # Two-dimensional samples get a visualisation of the result.
        if 2 == len(vectors[0]):
            draw_2D_cluster(vectors, result)

        return result
Ejemplo n.º 4
    def cluster(self,
        # NOTE(review): the parameter list is cut off after ``self`` in this
        # excerpt, so the block does not parse as shown. The body reads
        # vectors, DisType, assign_clusters, ClusterNum, Stype and trace;
        # restore the full signature from the original source.
        # stores the merge order

        self._distMap.clear()  # must be refreshed before clustering a different set of samples

        l = len(vectors)
        # Fill the distance map for every index pair (i, j) with i < j, using
        # the metric selected by DisType; other DisType values leave it empty.
        if ('cos' == DisType):
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = cosine_distance(
                        vectors[i], vectors[j])
        elif ('euc' == DisType):
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = euclidean_distance(
                        vectors[i], vectors[j])
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        # The clustering itself is delegated to VectorSpaceClusterer.cluster.
        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters,
                                              ClusterNum, Stype, trace)

        if (2 == len(vectors[0])):  # for two-dimensional samples, show the visualised result
            self.draw_2D(vectors, result)

        return result
Ejemplo n.º 5
    def __init__(self, position: VECTOR, radius: float, center: VECTOR):
        """Store the body's geometry and derive its orbit parameters.

        ``orbit_radius`` is the ``euclidean_distance`` between the unpacked
        ``position`` and ``center`` coordinates; ``rotation_angle`` is
        ``365 / (radius * orbit_radius)`` scaled down by a factor of 10.
        """
        self.position, self.radius, self.center = position, radius, center

        self.orbit_radius = euclidean_distance(*position, *center)
        per_step = 365 / (self.radius * self.orbit_radius)
        # Dividing by 10 is an arbitrary constant to slow things down.
        self.rotation_angle = per_step / 10
Ejemplo n.º 6
 def get_swap_cluster(self) -> Dict:
     """Find, per customer, the depots within a relative bound of the nearest one.

     NOTE(review): the statement that fills ``swap_cluster`` is missing from
     this excerpt (see the note near the end), so the exact shape of the
     returned mapping cannot be confirmed from here.
     """
     # Pass 1: remember each customer's nearest depot as (depot_id, distance).
     # The default entry (int, inf) is only a placeholder whose distance loses
     # every comparison.
     cluster = defaultdict(lambda: (int, float("inf")))
     for customer_id in self.customers:
         customer_coordinate = self.customers[customer_id][0]
         for depot_id in self.depots:
             depot_coordinate = self.depots[depot_id][0]
             distance = euclidean_distance(customer_coordinate, depot_coordinate)
             if distance < cluster[customer_id][1]:
                 cluster[customer_id] = depot_id, distance
     # Pass 2: compare every depot with the customer's nearest-depot distance.
     swap_cluster = defaultdict(list)
     for customer_id in self.customers:
         customer_coordinate = self.customers[customer_id][0]
         for depot_id in self.depots:
             depot_coordinate = self.depots[depot_id][0]
             distance = euclidean_distance(customer_coordinate, depot_coordinate)
             # Relative extra distance over the nearest depot must be at most 2.
             # NOTE(review): this divides by the nearest distance, which is 0
             # when a customer sits exactly on a depot.
             if ((distance - cluster[customer_id][1]) / cluster[customer_id][1]) <= 2:
             # NOTE(review): the body of this ``if`` is absent from the excerpt,
             # so the block does not parse; restore the statement that records
             # the match in ``swap_cluster`` from the original source.
     return swap_cluster
Ejemplo n.º 7
def make_label(dim, radius):
    """Build a ``dim`` x ``dim`` label grid with a filled disc at its centre.

    Every cell starts at -1. Cells whose ``euclidean_distance`` to the centre
    cell (``int(dim / 2.0)`` on both axes) is at most ``radius`` are set to 1.

    The scanned bounding box is clamped to the grid, so a radius that reaches
    past the border neither wraps around through negative indices nor raises
    ``IndexError``.
    """
    label = np.full((dim, dim), -1)
    center = int(dim / 2.0)
    reach = ceil(radius)
    start = max(center - reach, 0)
    end = min(center + reach, dim - 1)
    for i in inclusive_range(start, end):
        for j in inclusive_range(start, end):
            if euclidean_distance(i, j, center, center) <= radius:
                label[i, j] = 1
    return label
Ejemplo n.º 8
 def get_neighbourhood(self, index: tensor, radius: float) -> List[Tuple]:
     """List the output-grid cells within ``radius`` of ``index``.

     Scans every ``(row, col)`` of the ``self.output_rows`` x
     ``self.output_cols`` grid and returns ``((row, col), distance)`` pairs,
     in row-major order, for cells whose ``euclidean_distance`` to ``index``
     is at most ``radius``. The cell equal to ``index`` itself is left out.
     """
     neighbours = []
     for row in range(self.output_rows):
         for col in range(self.output_cols):
             cell = (row, col)
             cell_tensor = tensor(list(cell))
             gap = euclidean_distance(index, cell_tensor)
             if not (gap <= radius):
                 continue
             if np.array_equal(index, cell_tensor):
                 continue
             neighbours.append((cell, gap))
     return neighbours
Ejemplo n.º 9
def get_relational_features(target, landmark):
    """Describe where ``landmark`` lies relative to ``target``.

    Both arguments are read through their ``'pos_x'`` and ``'pos_y'`` keys.
    The result holds the coordinate differences (landmark minus target), the
    ``util.euclidean_distance`` between the two points, and two flags:
    ``'ab'`` is 0 when the y difference is negative, else 1; ``'lr'`` is 0
    when the x difference is positive, else 1.
    """
    target_xy = (target['pos_x'], target['pos_y'])
    landmark_xy = (landmark['pos_x'], landmark['pos_y'])
    dx = landmark_xy[0] - target_xy[0]
    dy = landmark_xy[1] - target_xy[1]
    separation = util.euclidean_distance(target_xy, landmark_xy)

    above_below = 1
    if dy < 0:
        above_below = 0
    left_right = 1
    if dx > 0:
        left_right = 0

    return {
        'ab': above_below,
        'lr': left_right,
        'xdiff': dx,
        'ydiff': dy,
        'dist': separation,
    }
Ejemplo n.º 10
 def calculate_food_distance(self, game_state, pacman_position, food_position):
     """Return ``util.euclidean_distance`` between the Pac-Man and food positions.

     An A* search is also run on a ``PositionSearchProblem``, but its path
     length is overwritten before returning (see the notes below).
     """
     # NOTE(review): neither pacman_position nor food_position is passed to the
     # problem here - check the original source for a lost argument line.
     problem = graphSearchProblem.PositionSearchProblem(game_state,
                                                        warn=False, visualize=False)
     path_to_food = graphSearchProblem.aStarSearch(problem)
     distance = len(path_to_food)
     # NOTE(review): the path length above is replaced immediately, so the
     # search only costs time. Confirm which of the two distances is intended.
     distance = util.euclidean_distance(pacman_position, food_position)
     return distance
Ejemplo n.º 11
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
	"""
	Compute the average cost of medoids based on certain cost function and do the clustering.

	Every index of ``data`` is assigned to the medoid with the smallest
	distance; on ties the medoid seen first wins.

	:param data: indexable collection of points
	:param costF_idx: 0 = euclidean_distance, 1 = manhattan_distance, 2 = pearson_distance
	:param medoids_idx: indices into ``data`` of the current medoids
	:param cacheOn: when true, distances are read from and written to the
		module-level ``distances_cache`` under the key ``(medoid, point)``
	:return: ``(avg_cost, medoids)`` - the sum over medoids of the mean distance
		of their members, and a dict mapping each medoid index to the list of
		point indices assigned to it
	:raises ValueError: if a distance has to be computed and ``costF_idx`` is unknown
	"""
	# Init the cluster
	medoids = {}
	total_cost = {}
	for idx in medoids_idx:
		medoids[idx] = []
		total_cost[idx] = 0.0

	# Compute the distance and do the clustering
	for i in range(len(data)):
		choice = -1
		# Make a big number
		min_cost = float('inf')
		for m in medoids:
			tmp = None
			if cacheOn:
				# Check for cache
				tmp = distances_cache.get((m, i), None)
			if tmp is None:
				if costF_idx == 0:
					tmp = euclidean_distance(data[m], data[i])
				elif costF_idx == 1:
					tmp = manhattan_distance(data[m], data[i])
				elif costF_idx == 2:
					tmp = pearson_distance(data[m], data[i])
				else:
					# Fail early: there is no distance to compare below.
					raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
				if cacheOn:
					# Save the distance for acceleration
					distances_cache[(m, i)] = tmp
			# Clustering
			if tmp < min_cost:
				choice = m
				min_cost = tmp
		# Done the clustering: record the membership as well as the cost,
		# otherwise every cluster stays empty and the average below cannot
		# be formed.
		medoids[choice].append(i)
		total_cost[choice] += min_cost

	# Compute the average cost
	avg_cost = 0.0
	for idx in medoids_idx:
		members = medoids[idx]
		if members:
			# A medoid that attracted no point contributes nothing.
			avg_cost += total_cost[idx] / len(members)

	# Return the average cost and clustering
	return (avg_cost, medoids)
Ejemplo n.º 12
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
	"""
	Compute the average cost of medoids based on certain cost function and do the clustering.

	Every index of ``data`` is assigned to the medoid with the smallest
	distance; on ties the medoid seen first wins.

	:param data: indexable collection of points
	:param costF_idx: 0 = euclidean_distance, 1 = manhattan_distance, 2 = pearson_distance
	:param medoids_idx: indices into ``data`` of the current medoids
	:param cacheOn: when true, distances are read from and written to the
		module-level ``distances_cache`` under the key ``(medoid, point)``
	:return: ``(avg_cost, medoids)`` - the sum over medoids of the mean distance
		of their members, and a dict mapping each medoid index to the list of
		point indices assigned to it
	:raises ValueError: if a distance has to be computed and ``costF_idx`` is unknown
	"""
	# Init the cluster
	medoids = {}
	total_cost = {}
	for idx in medoids_idx:
		medoids[idx] = []
		total_cost[idx] = 0.0

	# Compute the distance and do the clustering
	for i in range(len(data)):
		choice = -1
		# Make a big number
		min_cost = float('inf')
		for m in medoids:
			tmp = None
			if cacheOn:
				# Check for cache
				tmp = distances_cache.get((m, i), None)
			if tmp is None:
				if costF_idx == 0:
					tmp = euclidean_distance(data[m], data[i])
				elif costF_idx == 1:
					tmp = manhattan_distance(data[m], data[i])
				elif costF_idx == 2:
					tmp = pearson_distance(data[m], data[i])
				else:
					# Fail early: there is no distance to compare below.
					raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
				if cacheOn:
					# Save the distance for acceleration
					distances_cache[(m, i)] = tmp
			# Clustering
			if tmp < min_cost:
				choice = m
				min_cost = tmp
		# Done the clustering: record the membership as well as the cost,
		# otherwise every cluster stays empty and the average below cannot
		# be formed.
		medoids[choice].append(i)
		total_cost[choice] += min_cost

	# Compute the average cost
	avg_cost = 0.0
	for idx in medoids_idx:
		members = medoids[idx]
		if members:
			# A medoid that attracted no point contributes nothing.
			avg_cost += total_cost[idx] / len(members)

	# Return the average cost and clustering
	return (avg_cost, medoids)
Ejemplo n.º 13
 def get_customer_cluster(self) -> Dict:
     """Group customer ids by their nearest depot.

     For every customer the depot with the smallest ``euclidean_distance``
     between the two coordinates (element 0 of each record) is chosen; the
     depot seen first wins ties.

     Returns:
         A ``defaultdict(list)`` mapping depot id to the ids of the customers
         assigned to it. Customers are skipped when no depot qualifies, for
         example when ``self.depots`` is empty.
     """
     cluster = defaultdict(list)
     for customer_id in self.customers:
         customer_coordinate = self.customers[customer_id][0]
         best_distance = float('inf')
         best_depot = None
         for depot_id in self.depots:
             depot_coordinate = self.depots[depot_id][0]
             distance = euclidean_distance(customer_coordinate, depot_coordinate)
             if distance < best_distance:
                 best_depot = depot_id
                 best_distance = distance
         # Record the assignment; without this the result is always empty.
         if best_depot is not None:
             cluster[best_depot].append(customer_id)
     return cluster
Ejemplo n.º 14
    def __get_neighbors(self, data, k):
        """Return the ``k`` training rows closest to ``data``.

        Each returned row is a training feature list from ``self.train_x``
        with its label from ``self.train_y`` appended. Rows are ordered by
        ascending ``euclidean_distance`` to ``data``; fewer than ``k`` rows
        are returned when the training set is smaller than ``k``.
        """
        distances = []
        for i, features in enumerate(self.train_x):
            dist = euclidean_distance(data, features)
            distances.append((features + [self.train_y[i]], dist))

        # Nearest first - without ordering, the first k rows would simply be
        # the first k training samples.
        distances.sort(key=lambda pair: pair[1])

        # return the first k neighbors with the smallest distance
        return [row for row, _ in distances[:k]]
Ejemplo n.º 15
def cluster_points(points, cluster_dist=7):
    """Merge points that lie close together into cluster centroids.

    Repeatedly takes the first remaining point, gathers every remaining point
    whose ``util.euclidean_distance`` to it is below ``cluster_dist``, stores
    ``util.get_centroid`` of that group, and removes the group.

    The loop stops once at most one point remains, so a final lone point
    yields no centroid.
    NOTE(review): confirm that dropping the last lone point is intended.

    :param points: sequence of points
    :param cluster_dist: exclusive distance threshold for joining a cluster
    :return: list of the centroids found
    """
    old_points = np.array(points)
    new_points = []

    while len(old_points) > 1:
        p1 = old_points[0]
        distances = np.array(
            [util.euclidean_distance(p1, p2) for p2 in old_points])
        idx = (distances < cluster_dist)
        # The seed always belongs to its own cluster; this also guarantees
        # progress when cluster_dist is zero or negative.
        idx[0] = True
        points_cluster = old_points[idx]
        centroid = util.get_centroid(points_cluster)
        # Keep the centroid - otherwise the function always returns [].
        new_points.append(centroid)
        old_points = old_points[np.invert(idx)]

    return new_points
Ejemplo n.º 16
    def update_distance_from_car(self, car_pose):
        # Recomputes the distance to the car and records in
        # shift_relative_to_car whether it shrank or grew since the last call.
        # NOTE(review): the call below is cut off after its first argument in
        # this excerpt, so the block does not parse; restore the remaining
        # arguments (presumably derived from car_pose) from the original source.
        new_distance = util.euclidean_distance(self.x,

        # Only compare when a previous, truthy distance is stored.
        if self.distance_from_car:
            if floats_equal(new_distance, self.distance_from_car):
                # No change in shift_relative_to_car
                # NOTE(review): this branch holds no statement (a ``pass``
                # appears to be missing), which is a syntax error as shown.
            elif new_distance < self.distance_from_car:
                self.shift_relative_to_car = LIGHT_GETTING_CLOSER
            elif new_distance > self.distance_from_car:
                self.shift_relative_to_car = LIGHT_GETTING_FARTHER

        self.distance_from_car = new_distance
Ejemplo n.º 17
    def cluster(self,
        # NOTE(review): the parameter list is cut off after ``self`` in this
        # excerpt, so the block does not parse as shown. The body reads
        # vectors, DisType, assign_clusters, ClusterNum, Stype and trace;
        # restore the full signature from the original source.
        # stores the merge order

        self._distMap.clear()  # must be refreshed before clustering a different set of samples

        l = len(vectors)
        # Nothing to cluster for empty input.
        if (0 == l):
            return []

        # Fill the distance map for every index pair (i, j) with i < j, using
        # the metric selected by DisType; other DisType values leave it empty.
        if ('cos' == DisType):
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = cosine_distance(
                        vectors[i], vectors[j])
        elif ('euc' == DisType):
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = euclidean_distance(
                        vectors[i], vectors[j])
        # The clustering itself is delegated to VectorSpaceClusterer.cluster.
        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters,
                                              ClusterNum, Stype, trace)

        #/////////////////////// test: print the distances /////////////////
        # m = 0
        # for k,v in self._distMap:
        #     m +=1
        #     print v,"\t",
        #     if (m%7==0):
        #         print

        if (2 == len(vectors[0])):  # for two-dimensional samples, show the visualised result
            draw_2D_cluster(vectors, result)

        return result
Ejemplo n.º 18
    def get_features(self, data):
        """Reduce every row of ``data`` to its feature columns, in place.

        For each key of ``data`` the row stored under it loses the columns
        ``episode_id``, ``position``, ``id``, ``v_skew``, ``h_skew`` and
        ``orientation`` and gains ``c_diff``: the ``util.euclidean_distance``
        from the point (320, 240) to the row's ``(pos_x, pos_y)``.

        Returns the same ``data`` object that was passed in.
        """
        # Columns removed from every row, in this order.
        dropped_columns = ('episode_id', 'position', 'id',
                           'v_skew', 'h_skew', 'orientation')
        center = (320, 240)
        for eid in data:
            row = data[eid]
            for column in dropped_columns:
                del row[column]
            # distance from center
            row['c_diff'] = util.euclidean_distance(center, (row['pos_x'], row['pos_y']))

        return data
Ejemplo n.º 19
    def cluster(self, vectors, assign_clusters=False, DisType='cos',Stype='avg',trace=False):
        """Cluster ``vectors`` after rebuilding the pairwise distance map.

        The distance of every index pair ``(i, j)`` with ``i < j`` is stored
        in ``self._distMap``, a fresh ``Dendrogram`` over the vectors is kept
        in ``self._dendrogram``, and the clustering itself is delegated to
        ``VectorSpaceClusterer.cluster``.

        :param vectors: the sample vectors to cluster
        :param assign_clusters: forwarded to ``VectorSpaceClusterer.cluster``
        :param DisType: ``'cos'`` selects ``cosine_distance``, ``'euc'`` selects
            ``euclidean_distance``; any other value leaves the map empty
        :param Stype: forwarded to ``VectorSpaceClusterer.cluster``
        :param trace: forwarded to ``VectorSpaceClusterer.cluster``
        :return: the delegate's result
        """
        # The map must be refreshed before clustering a different sample set.
        self._distMap.clear()

        l = len(vectors)
        if 'cos' == DisType:
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = cosine_distance(vectors[i], vectors[j])
        elif 'euc' == DisType:
            for i in range(l):
                for j in range(i + 1, l):
                    self._distMap[(i, j)] = euclidean_distance(vectors[i], vectors[j])
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters, Stype, trace)

        return result
Ejemplo n.º 20
    def cluster(self, matrix):
        # Builds a symmetric pairwise distance matrix over the rows of
        # ``matrix`` and ranks samples by their m-th nearest distance.
        # NOTE(review): several statements are missing from this excerpt (see
        # the notes below) and it uses Python 2 print statements, so it does
        # not run as shown.
        l = len(matrix)
        # NOTE(review): the ``np.float`` alias no longer exists in recent
        # NumPy releases - confirm the NumPy version this targets.
        self.distList = np.zeros((l,l),np.float)
        for i in range(l):
            self.distList[i][i] = float('inf')      # a sample is not compared with itself
            for j in range(i+1,l):
                self.distList[i][j] = euclidean_distance(np.array(matrix[i]), np.array(matrix[j]))
                self.distList[j][i] = self.distList[i][j]
        mostSimList = []        # records the distances of the m samples most similar to the i-th sample
        m = 3
        marks = [i for i in range(l)]

        # Pair every distance with its sample index and keep the m smallest.
        for i in range(l):
            lis = self.distList[i].tolist()
            lis = zip(marks,lis)
            mostSimList.append(sorted(lis, key=lambda x:x[1])[0:m])

        ADist = []
        # NOTE(review): nothing is ever added to ADist inside this loop; the
        # statement that fills it appears to be missing from the excerpt.
        for i in range(l):
            ADist = sorted(ADist)

        # Order the samples by their m-th nearest distance, largest first.
        mostSimList = zip(marks,mostSimList)
        mostSimList = sorted(mostSimList, key=lambda x:x[1][m-1][1], reverse=True)

        noise = []
        # NOTE(review): the body of this loop is missing, and ``l/5`` is only
        # a valid slice bound under Python 2 integer division.
        for i in mostSimList[0:l/5]:

        print mostSimList
        print ADist
        print noise
        return ADist , noise
Ejemplo n.º 21
def targetFunction(data,
    # NOTE(review): the parameter list is cut off after ``data`` and the
    # docstring quotes are missing in this excerpt, so the block does not
    # parse. The body reads costType, medoids_idx, cacheOn, costF_idx,
    # distDict, simDict and namedPoints; restore the signature from the
    # original source.
    Compute the average cost of medoids based on certain cost function
    and do the clustering given the medoids
    if costType not in ["total", "average", "modularity"]:
        print "unknown target function - check the global variables in the code"
        return (1)

    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        # Make a big number
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            if cacheOn == True:
                # Check for cache
                tmp = distances_cache.get((m, i), None)
            if cacheOn == False or tmp == None:
                if costF_idx == 0:
                    # euclidean_distance
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    # manhattan_distance
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    # pearson_distance
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    # direct_distance
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    # similarity_distance
                    tmp = similarity_distance(data[m], data[i], simDict)
                    # NOTE(review): an ``else:`` appears to be missing before
                    # this print, and the format string has no placeholder,
                    # so the ``%`` operation raises TypeError.
                    print('Error: unknown cost function idx: ' % (costF_idx))
            if cacheOn == True:
                # Save the distance for acceleration
                distances_cache[(m, i)] = tmp
            # Clustering

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:  # no connection to either medoid
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.: choice = m
            elif tmp < min_cost:
                #if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering
        # NOTE(review): as shown, the cost is only accumulated when no medoid
        # was chosen (which then looks up key -1), and no point is ever added
        # to ``medoids`` or ``assignErrors``; an ``else:`` branch appears to be
        # missing here.
        if choice == -1:
            print "ERROR: the node cannot be assigned"
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        #print total_cost
        return (sum(total_cost.values()), medoids)

    elif costType == "average":
        # Compute the average cost
        avg_cost = 0.0
        for idx in medoids_idx:
            avg_cost += total_cost[idx] / len(medoids[idx])
        # Return the average cost and clustering
        return (avg_cost, medoids)

    elif costType == "modularity":
        # If the points are named, display the names
        if namedPoints == True:
            named_medoids = {}
            for medID in medoids_idx:
                named_medoids[data[medID]] = []
                # NOTE(review): the body of this inner loop is missing.
                for pointID in medoids[medID]:
            # "-" because we maximize modularity
            # NOTE(review): both modularity calls below are cut off after
            # their first argument, and the ``else:`` that separated the
            # named and unnamed variants appears to be missing.
            mod = -modularity(data,
            mod = -modularity(data,
        print "modularity computed"

        # NOTE(review): an ``else:`` appears to be missing before this branch.
        print "unknown target function"
        return (1)

    if len(assignErrors) > 0:
        print "unassigned nodes: ", assignErrors
        # NOTE(review): an ``else:`` appears to be missing before this print.
        print "no unassigned nodes, all right"

    return (mod, medoids)
Ejemplo n.º 22
def is_BIH_inlier(all_BIH_ip, corner, pix_dist=5):
    """Tell whether ``corner`` lies within ``pix_dist`` of any given point.

    :param all_BIH_ip: iterable of points compared against ``corner``
    :param corner: the point under test
    :param pix_dist: inclusive distance threshold
    :return: True as soon as one point's ``util.euclidean_distance`` to
        ``corner`` is at most ``pix_dist``; False otherwise, including for
        an empty iterable
    """
    # A generator lets any() stop at the first hit instead of measuring
    # every point up front.
    return any(util.euclidean_distance(ip, corner) <= pix_dist
               for ip in all_BIH_ip)
Ejemplo n.º 23
 def pacman_will_die(self, next_pacman_position, next_ghost_positions):
     """Tell whether any ghost ends up too close to Pac-Man's next position.

     Returns True when the ``util.euclidean_distance`` from
     ``next_pacman_position`` to at least one entry of
     ``next_ghost_positions`` is no larger than
     ``ReflexAgent.pacman_distance_from_ghost_coefficient``; otherwise False.
     """
     return any(
         util.euclidean_distance(next_pacman_position, ghost_position)
         <= ReflexAgent.pacman_distance_from_ghost_coefficient
         for ghost_position in next_ghost_positions
     )
Ejemplo n.º 24
def totalCost(data, costF_idx, medoids_idx, cacheOn=CacheOn, distDict={}, simDict={}, acceleration=0):
    # NOTE(review): the docstring quotes around the next two lines are missing
    # in this excerpt, so the block does not parse as shown. The dict defaults
    # above are shared between calls; the visible code only passes them on.
    Compute the total cost and do the clustering based on certain cost function
    (that is, assign each data point to certain cluster given the medoids)
    # Init the cluster
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
    # medoids['unassigned'] = []
    unassigned = []
    tmp = None

    # Compute the distance and do the clustering
    for i in xrange(size):
        choice = -1
        # Make a big number
        min_cost = float('inf')
        for m in medoids:
            if cacheOn == True:
                # Check for cache
                tmp = distances_cache.get((m, i), None)
            if cacheOn == False or tmp == None:
                if costF_idx == 0:
                    # euclidean_distance
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    # manhattan_distance
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    # pearson_distance
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    # direct_distance
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    # similarity_distance
                    # NOTE(review): the deeper indentation below suggests a
                    # lost ``try:``/``except`` pair around the call and the
                    # debug prints - restore it from the original source.
                        tmp = similarity_distance(data[m], data[i], simDict)
                        print m, i
                        print data[m]
                        print data[i]
                    # NOTE(review): an ``else:`` appears to be missing before
                    # this error print.
                    print('Error: unknown cost function idx: %d' % (costF_idx))
            if cacheOn == True:
                # Save the distance for acceleration
                distances_cache[(m, i)] = tmp
            # Clustering
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering
        # NOTE(review): no statement in this excerpt adds the point to
        # ``medoids[choice]``, so every cluster list stays empty as shown.
        if min_cost == 0:  # 0 similarity to all the medoids
            unassigned.append(i)  # medoids['unassigned'].append(i)
        total_cost += min_cost

    # With acceleration == 2 the clustering is re-keyed by the string position
    # of each medoid, holding the medoid and its nodes in a nested dict.
    if acceleration == 2:
        transformed_medoids = {} #dict(medoids)
        for i, m in enumerate(medoids.keys()):
            #print i, m
            transformed_medoids[str(i)] = {'med': m, 'nodes': medoids[m]}
            #transformed_medoids[i] = transformed_medoids.pop(m)
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids )
Ejemplo n.º 25
 def overlaps(self, other: "Planet") -> bool:
     """Tell whether this planet and ``other`` touch or intersect.

     True when the ``euclidean_distance`` between the two unpacked positions
     is at most the sum of both radii.
     """
     combined_radius = self.radius + other.radius
     return euclidean_distance(*self.position, *other.position) <= combined_radius
Ejemplo n.º 26
 def _overlaps(self, other: "Cell") -> bool:
     """Tell whether this cell and ``other`` touch or intersect.

     True when the ``euclidean_distance`` between the two cells' ``x``/``y``
     coordinates does not exceed the sum of their radii.
     """
     return (
         euclidean_distance(self.x, self.y, other.x, other.y)
         <= self.radius + other.radius
     )
Ejemplo n.º 27
def totalCost(data,
    # NOTE(review): the parameter list is cut off after ``data`` and the
    # docstring quotes are missing in this excerpt, so the block does not
    # parse. The body reads medoids_idx, cacheOn, costF_idx, distDict, simDict
    # and acceleration; restore the signature from the original source.
    Compute the total cost and do the clustering based on certain cost function
    (that is, assign each data point to certain cluster given the medoids)
    # Init the cluster
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
    # medoids['unassigned'] = []
    unassigned = []
    tmp = None

    # Compute the distance and do the clustering
    for i in xrange(size):
        choice = -1
        # Make a big number
        min_cost = float('inf')
        for m in medoids:
            if cacheOn == True:
                # Check for cache
                tmp = distances_cache.get((m, i), None)
            if cacheOn == False or tmp == None:
                if costF_idx == 0:
                    # euclidean_distance
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    # manhattan_distance
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    # pearson_distance
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    # direct_distance
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    # similarity_distance
                    # NOTE(review): the deeper indentation below suggests a
                    # lost ``try:``/``except`` pair around the call and the
                    # debug prints - restore it from the original source.
                        tmp = similarity_distance(data[m], data[i], simDict)
                        print m, i
                        print data[m]
                        print data[i]
                    # NOTE(review): an ``else:`` appears to be missing before
                    # this error print.
                    print('Error: unknown cost function idx: %d' % (costF_idx))
            if cacheOn == True:
                # Save the distance for acceleration
                distances_cache[(m, i)] = tmp
            # Clustering
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering
        # NOTE(review): no statement in this excerpt adds the point to
        # ``medoids[choice]``, so every cluster list stays empty as shown.
        if min_cost == 0:  # 0 similarity to all the medoids
            unassigned.append(i)  # medoids['unassigned'].append(i)
        total_cost += min_cost

    # With acceleration == 2 the clustering is re-keyed by the string position
    # of each medoid, holding the medoid and its nodes in a nested dict.
    if acceleration == 2:
        transformed_medoids = {}  #dict(medoids)
        for i, m in enumerate(medoids.keys()):
            #print i, m
            transformed_medoids[str(i)] = {'med': m, 'nodes': medoids[m]}
            #transformed_medoids[i] = transformed_medoids.pop(m)
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)
Ejemplo n.º 28
def targetFunction(data, costF_idx, medoids_idx, cacheOn=False, distDict={},
                   simDict={}, affinities={}, costType=CostType,
    # NOTE(review): the parameter list is cut off after ``costType`` (the body
    # also reads namedPoints) and the docstring quotes are missing in this
    # excerpt, so the block does not parse as shown.
    Compute the average cost of medoids based on certain cost function
    and do the clustering given the medoids
    if costType not in ["total", "average", "modularity"]:
        print "unknown target function - check the global variables in the code"

    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        # Make a big number
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            if cacheOn == True:
                # Check for cache
                tmp = distances_cache.get((m,i), None)
            if cacheOn == False or tmp == None:
                if costF_idx == 0:
                    # euclidean_distance
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    # manhattan_distance
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    # pearson_distance
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    # direct_distance
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    # similarity_distance
                    tmp = similarity_distance(data[m], data[i], simDict)
                    # NOTE(review): an ``else:`` appears to be missing before
                    # this print, and the format string has no placeholder,
                    # so the ``%`` operation raises TypeError.
                    print('Error: unknown cost function idx: ' % (costF_idx))
            if cacheOn == True:
                # Save the distance for acceleration
                distances_cache[(m,i)] = tmp
            # Clustering

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp==0.0 and min_cost==0.0: # no connection to either medoid
                rv = bernoulli.rvs(1./len(medoids_idx), size=1)
                if rv[0]==1.: choice = m
            elif tmp < min_cost:
                #if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering
        # NOTE(review): as shown, the cost is only accumulated when no medoid
        # was chosen (which then looks up key -1), and no point is ever added
        # to ``medoids`` or ``assignErrors``; an ``else:`` branch appears to be
        # missing here.
        if choice == -1:
            print "ERROR: the node cannot be assigned"
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        #print total_cost
        return(sum(total_cost.values()), medoids)

    elif costType == "average":
    # Compute the average cost
        avg_cost = 0.0
        for idx in medoids_idx:
            avg_cost += total_cost[idx] / len(medoids[idx])
        # Return the average cost and clustering
        return(avg_cost, medoids)

    elif costType == "modularity":
        # If the points are named, display the names
        if namedPoints == True:
            named_medoids = {}
            for medID in medoids_idx:
                named_medoids[data[medID]] = []
                # NOTE(review): the body of this inner loop is missing.
                for pointID in medoids[medID]:
            # "-" because we maximize modularity
            # NOTE(review): as shown, the second call overwrites the first;
            # the ``else:`` that separated the named and unnamed variants
            # appears to be missing.
            mod = -modularity(data, COST=costF_idx, distDict=distDict, edgeDict=affinities, medoids=named_medoids)
            mod = -modularity(data, COST=costF_idx, distDict=distDict, edgeDict=affinities, medoids=medoids)
        print "modularity computed"

        # NOTE(review): an ``else:`` appears to be missing before this branch.
        print "unknown target function"

    if len(assignErrors) > 0:
        print "unassigned nodes: ", assignErrors
        # NOTE(review): an ``else:`` appears to be missing before this print.
        print "no unassigned nodes, all right"

    return(mod, medoids)
Ejemplo n.º 29
def cutNoise(matrix):
    # Splits the vectors of ``matrix`` into "real" and "noise" samples based on
    # pairwise euclidean distances.
    # NOTE(review): many statements are missing from this excerpt (conditions,
    # loop bodies - see the notes below) and it uses Python 2 print statements,
    # so it does not parse as shown.
    print "total:",len(matrix)

    discard = []        # discard need not be returned; it can be derived from noise and real
    noise = []
    real = []
    noise_matrix = []
    real_matrix = []

    rest = []
    rm = False
        # NOTE(review): the condition guarding this assignment is missing.
        rm = True
    for i,vector in enumerate(matrix):
        No_0 = 0                        # counts the non-zero features
        for j in vector:
            if 0!=j:
                No_0 += 1
        # NOTE(review): an ``else:`` appears to be missing between the two
        # ``low`` assignments; as shown, ``low`` ends up 1 whenever rm is true.
        if rm:
            low = 3
            low = 1

        # NOTE(review): the body of this ``if`` is missing, along with the
        # statements that fill ``discard``, ``rest`` and ``real``.
        if (low > No_0):                # discard vectors whose number of features is below the lower bound

    print "discard:",len(discard)
    l = len(rest)
    # NOTE(review): the ``np.float`` alias no longer exists in recent NumPy
    # releases - confirm the NumPy version this targets.
    distList = np.zeros((l,l),np.float)
    for i in range(l):
        distList[i][i] = float('inf')       # a sample is not compared with itself
        for j in range(i+1,l):
            distList[i][j] = euclidean_distance(np.array(rest[i]), np.array(rest[j]))
            distList[j][i] = distList[i][j]
            # if(distList[i][j]==0):
            #     print i,":",rest[i]
            #     print j,":",rest[j]
    mostSimList = []        # records the distance of the m-th most similar sample to the i-th sample
    m = 1
        # NOTE(review): the condition guarding this early-return block is
        # missing; it treats every remaining vector as noise.
        noise = copy.copy(real)
        real = []
        noise_matrix = rest
        real_matrix = []
        tmp = []
        angles = []
        return real_matrix,noise_matrix,tmp,angles,real,noise

    marks = [i for i in range(l)]

    # NOTE(review): nothing is appended to mostSimList in this loop; the
    # statement using the sorted distances appears to be missing.
    for i in range(l):
        lis = distList[i].tolist()
        lis = sorted(lis)

    ADist = zip(marks,mostSimList)
    ADist = sorted(ADist, key = lambda x: x[1], reverse=True)

    end = l-1
    Dlist = []
    # NOTE(review): Dlist is never filled in this loop as shown.
    while  end>=0:
        end -= 1
    # print Dlist
    # draw_line(Dlist)

    tmp, angles, part = min_Angle_part(Dlist)    # note: Dlist should be in ascending order
    print part

    # NOTE(review): the body of this loop is missing, and the slice bound
    # relies on Python 2 integer division.
    for e in ADist[0:(1+part)*l/10]:

    real0 = copy.copy(real)                 # corresponds to the vectors in rest

    real = sorted(list(set(real) - set(noise)))
    noise = sorted(noise) 

    real_matrix = []
    # NOTE(review): the body of the ``if`` below (and presumably its ``else``)
    # is missing, so real_matrix and noise_matrix are never filled here.
    for i , vector in enumerate(rest):
        if real0[i] in noise:
    return real_matrix,noise_matrix,tmp,angles,real,noise