def dataset_values(coord_array):
    out = {}
    # pairwise distances between all points in the dataset
    internal_distances = [
        geom.distance(i[0], i[1])
        for i in itertools.combinations(coord_array, 2)
    ]
    i_d_stdev_avg = statistic.stdev_avg(internal_distances)
    out["internal distances avg"] = i_d_stdev_avg[1]
    out["internal distances stdev"] = i_d_stdev_avg[0]
    out["internal distances max"] = max(internal_distances)
    out["internal distances min"] = min(internal_distances)
    # centroid of the whole dataset and the spread of points around it
    centroid_stdev_avg = statistic.stdev_avg_array(coord_array)
    centroid = centroid_stdev_avg[1]
    out["centroid"] = centroid
    out["stdev of coordinates against centroid"] = centroid_stdev_avg[0]
    distances_centroid_point = [geom.distance(i, centroid) for i in coord_array]
    avg_stdev_d_c = statistic.stdev_avg(distances_centroid_point)
    out["avg distance from centroid"] = avg_stdev_d_c[1]
    out["stdev distance from centroid"] = avg_stdev_d_c[0]
    out["min distance from centroid"] = min(distances_centroid_point)
    out["max distance from centroid"] = max(distances_centroid_point)
    return out
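
# Hedged usage sketch (added for illustration, not part of the original
# module): dataset_values on a few 2-D points, assuming geom.distance is the
# Euclidean distance and statistic.stdev_avg returns a (stdev, avg) pair, as
# the indexing above implies.
def _example_dataset_values():
    points = [(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)]
    stats = dataset_values(points)
    # e.g. the centroid of the unit square is (0.5, 0.5)
    return stats["internal distances avg"], stats["centroid"]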
def general_evaluate_clustered_object(data_obj):
    out = {}
    cluster_dict = data_obj.clusters_into_lists_dict()
    # distances between points that share a cluster
    internal_distances = []
    avg_internal_distances_per_cluster = []
    for key in cluster_dict:
        internal_pairs = itertools.combinations(cluster_dict[key], 2)
        current_internal_distances = [
            geom.distance(a[0], a[1]) for a in internal_pairs
        ]
        internal_distances.extend(current_internal_distances)
        avg_internal_distances_per_cluster.append(
            statistic.avg(current_internal_distances))
    out["internal_distances"] = statistic.dict_evaluation(internal_distances)
    out["internal_distances_per_cluster"] = statistic.dict_evaluation(
        avg_internal_distances_per_cluster)
    # distances between points from different clusters
    external_distances = []
    cluster_key_pairs = itertools.combinations(list(cluster_dict.keys()), 2)
    for i in cluster_key_pairs:
        external_distances.extend(
            itertools.product(cluster_dict[i[0]], cluster_dict[i[1]]))
    if len(set(data_obj.labels)) > 1:
        external_distances = [
            geom.distance(i[0], i[1]) for i in external_distances
        ]
        out["external_distances"] = statistic.dict_evaluation(
            external_distances)
    # size of each cluster (fixed: the original measured the same cluster
    # every iteration via a stale loop variable)
    cluster_sizes = [len(cluster_dict[i]) for i in cluster_dict]
    out["cluster_sizes"] = statistic.dict_evaluation(cluster_sizes)
    # per-cluster centroids and point-to-centroid distances
    centroids = []
    distances_between_centroids_and_their_points = []
    for key in cluster_dict:
        centroid = statistic.avg_array(cluster_dict[key])
        centroids.append(centroid)
        distances = [geom.distance(centroid, i) for i in cluster_dict[key]]
        distances_between_centroids_and_their_points.extend(distances)
    out["centroid_and_their_points_distances"] = statistic.dict_evaluation(
        distances_between_centroids_and_their_points)
    if len(set(data_obj.labels)) > 1:
        centroid_distances = [
            geom.distance(a[0], a[1])
            for a in itertools.combinations(centroids, 2)
        ]
        out["centroid_distances"] = statistic.dict_evaluation(
            centroid_distances)
    return out
def dunn_index(data_obj):
    # group coordinates by label
    clusters = {}
    for num, i in enumerate(data_obj.coords):
        label = data_obj.labels[num]
        if label in clusters:
            clusters[label].append(i)
        else:
            clusters[label] = [i]
    # cluster "diameter": average distance from a cluster's points to its centroid
    centroids = {}
    avg_distances_to_center = {}
    for key in clusters:
        centroid = statistic.avg_coords(clusters[key])
        centroids[key] = centroid
        distances_to_center = [geom.distance(centroid, i) for i in clusters[key]]
        avg_distances_to_center[key] = (sum(distances_to_center) /
                                        len(distances_to_center))
    max_cluster_size = max(avg_distances_to_center.values())
    cluster_key_pairs = list(itertools.combinations(list(clusters.keys()), 2))
    # between-cluster separation: the mean of the centroid-to-centroid
    # distance and the single-link (closest point pair) distance
    distances_between_cluster_centroids = {}
    for i in cluster_key_pairs:
        distances_between_cluster_centroids[i] = geom.distance(
            centroids[i[0]], centroids[i[1]])
    distances_between_clusters_as_min_dist_between_pairs = {}
    for i in cluster_key_pairs:
        point_pairs_between_2_clusters = itertools.product(
            clusters[i[0]], clusters[i[1]])
        distances_in_point_pairs = [
            geom.distance(j[0], j[1]) for j in point_pairs_between_2_clusters
        ]
        distances_between_clusters_as_min_dist_between_pairs[i] = min(
            distances_in_point_pairs)
    resulting_distances_between_cluster_pairs = {}
    for i in cluster_key_pairs:
        resulting_distances_between_cluster_pairs[i] = (
            distances_between_cluster_centroids[i] +
            distances_between_clusters_as_min_dist_between_pairs[i]) / 2.0
    min_distance_between_clusters = min(
        resulting_distances_between_cluster_pairs.values())
    # Dunn-style index: worst-case separation over the largest cluster spread
    return min_distance_between_clusters / max_cluster_size
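
# Hedged usage sketch: computing the index above on a toy labeled dataset.
# The bare _Data class is a hypothetical stand-in for the project's data
# object; only .coords and .labels are assumed, as used above.
def _example_dunn_index():
    class _Data:
        pass
    d = _Data()
    d.coords = [(0.0, 0.0), (0.1, 0.2), (5.0, 5.0), (5.2, 4.9)]
    d.labels = [0, 0, 1, 1]
    # larger values indicate tighter, better-separated clusters
    return dunn_index(d)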
def _init_plus_plus(data_obj):
    # k-means++ seeding: the first center is uniform at random, each
    # subsequent center is sampled with probability proportional to the
    # squared distance to the nearest center chosen so far
    data_obj.c_positions = []
    remaining = data_obj.c_number
    if remaining > 0:
        remaining -= 1
        data_obj.c_positions.append(random.choice(data_obj.coords))
    distances = [float("inf") for i in data_obj.coords]
    while remaining > 0:
        last_center = data_obj.c_positions[-1]
        remaining -= 1
        for num, i in enumerate(distances):
            current_coord = data_obj.coords[num]
            new_distance = math.pow(geom.distance(last_center, current_coord), 2.0)
            distances[num] = min(distances[num], new_distance)
        sums = sum(distances)
        next_distances = [i / sums for i in distances]
        indexes = list(range(len(data_obj.coords)))
        # numpy.random.choice (assumed import); called without a size
        # argument it returns a scalar that can index the coords list,
        # where the size-1 array the original requested could not
        new_center_index = choice(indexes, p=next_distances)
        data_obj.c_positions.append(data_obj.coords[new_center_index])
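
# Hedged usage sketch: seeding three centers on a small grid. _Data is a
# hypothetical stand-in; only .coords and .c_number are assumed, as above.
def _example_init_plus_plus():
    class _Data:
        pass
    d = _Data()
    d.coords = [(float(x), float(y)) for x in range(5) for y in range(5)]
    d.c_number = 3
    _init_plus_plus(d)
    return d.c_positions  # three seeds, biased toward being far apart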
def linear_circle_point(coords, radius):
    # rejection sampling: draw from the bounding square until the point
    # falls inside the circle, then translate it to the given center
    while True:
        x = random.uniform(-radius, radius)
        y = random.uniform(-radius, radius)
        if geom.distance((0.0, 0.0), (x, y)) < radius:
            break
    return (x + coords[0], y + coords[1])
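
# Alternative sketch (added for illustration, not part of the original
# module): sampling the radius as radius * sqrt(u) with u ~ U(0, 1) gives a
# uniform point in the disc with no rejection loop.
def linear_circle_point_polar(coords, radius):
    r = radius * math.sqrt(random.random())  # sqrt keeps the area density uniform
    theta = random.uniform(0.0, 2.0 * math.pi)
    return (coords[0] + r * math.cos(theta), coords[1] + r * math.sin(theta))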
def triangulation_distance_within(coords):
    triangulated = geom.triangulate_set(coords)
    distances = [geom.distance(i[0], i[1]) for i in triangulated]
    avg = sum(distances) / len(distances)
    stdev = 0
    for i in distances:
        stdev = stdev + math.pow(avg - i, 2)
    stdev = math.sqrt(stdev / len(distances))
    return (avg, stdev)
def agglomerative_single_link(data_obj, **kwargs):
    anim_obj = kwargs.get("anim_obj", None)
    animated = anim_obj is not None
    # dict of clusters; key: int (index of the first point), value: list of point indexes
    clusters = {i: [i] for i in range(len(data_obj.coords))}
    data_obj.labels = [0 for i in data_obj.coords]
    # dict of distances; key: frozenset({cluster_a, cluster_b}), value: distance between them
    distances = {}
    for i in itertools.combinations(clusters.keys(), 2):
        distances[frozenset(i)] = geom.distance(data_obj.coords[i[0]],
                                                data_obj.coords[i[1]])
    while len(clusters) > data_obj.c_number:
        # find the pair of clusters with the minimal distance
        min_c = list(min(distances, key=distances.get))
        a = min_c[0]
        b = min_c[1]
        # merge cluster b into cluster a
        clusters[a].extend(clusters[b])
        clusters.pop(b, None)
        # merge the distance entries of the two clusters so that the smaller
        # distance survives (single link); a frozenset of length 1 means i is
        # one of the merged clusters, so both slots point at the same entry
        distance_hash_pairs = []
        for i in clusters:
            added = [frozenset((i, a)), frozenset((i, b))]
            if len(added[0]) == 1:
                added[0] = added[1]
            if len(added[1]) == 1:
                added[1] = added[0]
            distance_hash_pairs.append(added)
        for i in distance_hash_pairs:
            distances[i[0]] = min((distances[i[0]], distances[i[1]]))
            distances.pop(i[1], None)
        # animation snapshot after each merge
        if animated:
            for key, val in clusters.items():
                for i in val:
                    data_obj.labels[i] = key
            anim_obj.add_step(data_obj)
    for key, val in clusters.items():
        for i in val:
            data_obj.labels[i] = key
    return data_obj
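
# Hedged usage sketch: clustering four points into two single-link clusters.
# _Data is a hypothetical stand-in exposing .coords and .c_number as above;
# the resulting label values are surviving cluster keys, not 0..k-1.
def _example_single_link():
    class _Data:
        pass
    d = _Data()
    d.coords = [(0.0, 0.0), (0.0, 1.0), (10.0, 10.0), (10.0, 11.0)]
    d.c_number = 2
    agglomerative_single_link(d)
    return d.labels  # e.g. [0, 0, 2, 2]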
def simplified_k_means(data_obj, indexes, iterations, cluster_num):
    # k-means restricted to the points listed in `indexes`
    centers = random.sample(indexes, cluster_num)
    centers = [data_obj.coords[i] for i in centers]
    subclusters = [[] for i in range(cluster_num)]
    for _ in range(iterations):
        subclusters = [[] for i in range(cluster_num)]
        # assign every point to its nearest center
        distances = [[geom.distance(data_obj.coords[i], j) for j in centers]
                     for i in indexes]
        for num, i in enumerate(distances):
            index = i.index(min(i))
            # record the original coordinate index so the subclusters map
            # straight back onto data_obj.coords
            subclusters[index].append(indexes[num])
        # move each center to the centroid of its points; re-seed empty clusters
        for num, i in enumerate(subclusters):
            centroided = [data_obj.coords[j] for j in i]
            if len(centroided) > 0:
                centers[num] = statistic.avg_coords(centroided)
            else:
                centers[num] = random.choice(data_obj.coords)
    return subclusters
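
# Hedged usage sketch: running the simplified k-means over a subset of the
# points; the returned subclusters hold original coordinate indexes. _Data is
# a hypothetical stand-in for the project's data object.
def _example_simplified_k_means():
    class _Data:
        pass
    d = _Data()
    d.coords = [(0.0, 0.0), (0.5, 0.5), (9.0, 9.0), (9.5, 9.5), (99.0, 99.0)]
    subset = [0, 1, 2, 3]  # leave the outlier at index 4 out of the run
    return simplified_k_means(d, subset, iterations=5, cluster_num=2)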
def _balanced_centers(field_size, dist_function, dist_function_params,
                      num_of_clusters):
    # rejection-style placement of cluster centers: keep drawing candidate
    # positions until num_of_clusters centers satisfy dist_function pairwise
    data_obj = C()
    data_obj.c_number = num_of_clusters
    passes = False
    while not passes:
        counter = 0  # reset the attempt budget on every restart
        data_obj.c_positions = [
            linear_square_point([field_size / 2.0 for x in [0, 1]], field_size)
        ]
        while (not passes) and counter < 100 * data_obj.c_number:
            newobj = linear_square_point(
                [field_size / 2.0 for x in [0, 1]], field_size)
            temp_passes = True
            for i in data_obj.c_positions:
                if (not dist_function(i, newobj, dist_function_params)
                        and geom.distance(i, newobj) > 0):
                    temp_passes = False
                    break
            if temp_passes:
                data_obj.c_positions.append(newobj)
            counter += 1
            passes = (len(data_obj.c_positions) == data_obj.c_number)
    return data_obj
def _reassign_to_cluster_centers(data_obj):
    for num, i in enumerate(data_obj.coords):
        distances = [geom.distance(i, c) for c in data_obj.c_positions]
        mindist = min(distances)
        data_obj.labels[num] = distances.index(mindist)
def _euclidean_balanced(a, b, min_length):
    return geom.distance(a, b) > min_length
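
# Hedged usage sketch: _euclidean_balanced has exactly the (a, b, params)
# shape _balanced_centers expects for dist_function, here enforcing a minimum
# spacing of 2.0 between four centers on a 10 x 10 field (the numbers are
# illustrative, not from the original code; linear_square_point and C are
# assumed to be defined elsewhere in this module).
def _example_balanced_centers():
    return _balanced_centers(10.0, _euclidean_balanced, 2.0, 4).c_positions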