def _re_calculate_proximity_matrix(self):
     start_time = time.time()
     
     num_processes = config.MAX_AVAILABLE_CPU_CORES * 3 # should be larger than the number of available processors because the work is not evenly spread
     
     # Spread the work across all the processes.
     # Each process handles roughly the same number of data instances, but higher indices have fewer calculations to do than lower ones.
     range_indices = np.arange(self.num_data_instances)
     divided_indices = np.array_split(range_indices, num_processes)
     
     # Execute multi-processor functions
     with concurrent.futures.ProcessPoolExecutor(max_workers=config.MAX_AVAILABLE_CPU_CORES) as executor:
         results = [executor.submit(initial_proxy_matrix_distance_calculator_thread, index,
                                    divided_indices[index][0], divided_indices[index][-1],
                                    self.num_data_instances, self.training_data)
                    for index in range(num_processes)]
         
         # collect the results as the processes complete and fill out the proximity matrix
         for f in concurrent.futures.as_completed(results):
             index, result_rows = f.result()
             for x in range(result_rows.shape[0]):
                 x_pos = divided_indices[index][0] + x
                 for y in range(x_pos + 1, self.num_data_instances):
                     self.proximity_matrix[x_pos, y] = result_rows[x, y]
                     self.proximity_matrix[y, x_pos] = result_rows[x, y]
     
     config.PrintDebug("Completed in {:.2f} seconds. Sum {}".format((time.time() - start_time), np.nansum(self.proximity_matrix)))
     
     with open(self.INITIAL_PROXY_MATRIX_FILE_NAME, 'wb') as f:
         config.PrintDebug("Saving Initial Proximity Matrix Into {}".format(self.INITIAL_PROXY_MATRIX_FILE_NAME))
         np.save(f, self.proximity_matrix)
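
# A minimal sketch of the module-level worker submitted above; its real body is
# not part of this section. It assumes the proximity measure is the Euclidean
# distance between two rows of training_data (the histograms), and that each
# worker fills only the upper triangle of its assigned block of rows.
def initial_proxy_matrix_distance_calculator_thread(index, start_row, end_row, num_data_instances, training_data):
    # one row per assigned data instance, NaN for the untouched lower triangle
    result_rows = np.full((end_row - start_row + 1, num_data_instances), np.nan)
    for x in range(start_row, end_row + 1):
        for y in range(x + 1, num_data_instances):
            # assumed metric: Euclidean distance between the two histograms
            result_rows[x - start_row, y] = np.linalg.norm(training_data[x] - training_data[y])
    return (index, result_rows)
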
 def _create_and_save_training_data_histograms(self, num_bins, image_paths_list):
     all_imagenames = []
     all_histograms = []
     for image_path in image_paths_list:
         im_name = os.path.basename(image_path)
         config.PrintDebug("Processing: {}".format(im_name))
         
         # load each image file and change the pixel range to be from 0 - 1.
         image = skimage.io.imread(fname=image_path, as_gray=True) / config.U16_MAX_VAL
         # convert image to histogram
         histogram, bin_edges = np.histogram(image, bins=num_bins, range=(0, 1))
         # cache histogram into array
         all_histograms.append(histogram)
         all_imagenames.append(im_name)
     
     # convert to numpy array and save it to file
     all_histograms = np.array(all_histograms)
     with open(self.HISTOGRAM_FILENAME, 'wb') as f:
         config.PrintDebug("Saving Histogram Data Into {}".format(self.HISTOGRAM_FILENAME))
         np.save(f, all_histograms)
     config.PrintDebug("Process complete. Saved: {} Rows Of Data".format(all_historgrams.shape[0]))
     
     all_imagenames = np.array(all_imagenames)
     with open(self.TRAINING_IMG_NAMES_FILENAME, "wb") as f:
         config.PrintDebug("Saving Training Image Filenames Into {}".format(self.TRAINING_IMG_NAMES_FILENAME))
         np.save(f, all_imagenames)
     
     return (all_histograms, all_imagenames)
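
 # Usage sketch (hypothetical paths; glob and os imports assumed):
 #   image_paths = glob.glob(os.path.join("training_images", "*.png"))
 #   histograms, names = self._create_and_save_training_data_histograms(256, image_paths)
 # The division by config.U16_MAX_VAL above assumes 16-bit grayscale inputs, so
 # each histogram covers the normalized pixel range [0, 1].
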
 def _load_merge_history(self):
     with open(self.MERGE_HISTORY_FILE_NAME, 'rb') as f:
         config.PrintDebug("Loading Merge History From {}".format(self.MERGE_HISTORY_FILE_NAME))
         self.merge_history = np.load(f)
     with open(self.MERGE_CLUSTER_HISTORY_FILE_NAME, 'rb') as f:
         config.PrintDebug("Loading Merge Cluster History From {}".format(self.MERGE_CLUSTER_HISTORY_FILE_NAME))
         self.cluster_history = pickle.load(f)
    def _predict_cluster_and_accuracy(self, image_path, num_bins, num_clusters):
        # 1. Load image: load each image file and change the pixel range to be from 0 - 1.
        config.PrintDebug("Predicting: {}".format(image_path.split('\\')[-1]))
        
        image = skimage.io.imread(fname=image_path, as_gray=True) / config.U16_MAX_VAL
        # 2. Convert to histogram
        histogram, bin_edges = np.histogram(image, bins=num_bins, range=(0, 1))
        
        # 3. Find the right cluster and the distance to it
        cluster_distances = []
        for c in range(num_clusters):
            #dist_to_cluster = self._calculate_distance_to_cluster_V2(c, histogram)
            dist_to_cluster = self._calculate_distance_to_cluster(c, histogram, LINKAGE.AVERAGE)
            cluster_distances.append((c, dist_to_cluster))
        
        cluster_distances = sorted(cluster_distances, key=lambda x: x[1])
        
        # 4. Subtract the within-cluster distance
        closest_cluster = cluster_distances[0][0]
        dist_to_closest = cluster_distances[0][1]        
        
        # if the distance is less than 0 then the image is inside the cluster
#        if dist_to_closest < 0:
#            dist_to_closest = 0
        
        # 5. Return results
        return (closest_cluster, dist_to_closest)
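
    # A minimal sketch of the helper called in step 3 above; its real body is
    # not part of this section. For LINKAGE.AVERAGE it is assumed to return the
    # mean distance between the query histogram and every member of cluster c,
    # using the same metric as the proximity matrix (Euclidean is assumed).
    def _calculate_distance_to_cluster(self, c, histogram, linkage_type):
        member_indices = list(self.clusters[c])
        distances = [np.linalg.norm(histogram - self.training_data[m]) for m in member_indices]
        if linkage_type == LINKAGE.AVERAGE:
            return np.mean(distances)
        return np.min(distances)  # single-linkage style fallback (assumption)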
    def _create_clusters_from_training_data(self, num_clusters):
        # if recreating the clusters, delete the existing info
        if os.path.isdir(self.CLUSTER_INFO_FOLDER_NAME):
            shutil.rmtree(self.CLUSTER_INFO_FOLDER_NAME)
        
        # 1. Create array, one set for each cluster
        self.clusters = []
        for i in range(num_clusters):
            self.clusters.append(set())
        
        # 2. populate the first cluster with all elements
        for val in self.cluster_history[-1]:
            self.clusters[0].add(val)
        
        print("First has: {} elements".format(len(self.clusters[0])))
        # 3. loop backwards through the merge history and split off the merged parts
        hist_ind = -2
        cluster_to_set = self.get_empty_clusterset_index(self.clusters)
        while cluster_to_set != -1:
            cluster_to_split = -1
            for val in self.cluster_history[hist_ind]:
                # find the right cluster to split
                if cluster_to_split == -1:
                    for k in range(num_clusters):
                        if val in self.clusters[k]:
                            cluster_to_split = k
                            break
                self.clusters[cluster_to_split].remove(val)
                self.clusters[cluster_to_set].add(val)
            hist_ind -= 1
            cluster_to_set = self.get_empty_clusterset_index(self.clusters)
 
    
        self.clusters = np.array(self.clusters, dtype=object)
        with open(self.CLUSTER_GROUPS_FILE_NAME, 'wb') as f:
            config.PrintDebug("Saving Cluster Groups Into {}".format(self.CLUSTER_GROUPS_FILE_NAME))
            np.save(f, self.clusters)
        
        if not os.path.exists(self.CLUSTER_INFO_FOLDER_NAME):
            os.makedirs(self.CLUSTER_INFO_FOLDER_NAME)
        
        # Save the image file names of each cluster into a file for evaluation
        for c in range(self.clusters.shape[0]):
            cluster_indices = list(self.clusters[c])
            with open("{}/Cluster_{}.txt".format(self.CLUSTER_INFO_FOLDER_NAME, c), 'w') as f:
                for cluster_index in cluster_indices:
                    f.write("{}\n".format(self.training_filenames[cluster_index]))
                    
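
    # A minimal sketch of the helper used by the splitting loop above; its real
    # body is not part of this section. It is assumed to return the index of
    # the first still-empty cluster set, or -1 once every set has elements.
    def get_empty_clusterset_index(self, clusters):
        for i, cluster_set in enumerate(clusters):
            if len(cluster_set) == 0:
                return i
        return -1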
 def calculate_within_cluster_variance(self):
     self.within_cluster_variance = [0.0] * len(self.clusters)
     for cluster_index in range(len(self.clusters)):
         print("Calculating Within-Cluster Variance for cluster: {}".format(cluster_index))
         cluster = list(self.clusters[cluster_index])
         
         mean_within_dist = 0.0
         total_dist = []
         for i in range(0, len(cluster) - 1):
             for k in range(i + 1, len(cluster)):
                 total_dist.append(self.proximity_matrix[cluster[i]][cluster[k]])
         if total_dist:
             mean_within_dist = np.nanmean(total_dist)
         
         num_processes = config.MAX_AVAILABLE_CPU_CORES
         # Spread the work across all the processes.
         # Each process handles roughly the same number of data instances, but higher indices have fewer calculations to do than lower ones.
         range_indices = np.arange(len(cluster))
         divided_indices = np.array_split(range_indices, num_processes)
         
         largest_diff = 0.0
         # Execute multi-processor functions
         with concurrent.futures.ProcessPoolExecutor(max_workers=config.MAX_AVAILABLE_CPU_CORES) as executor:
             results = [executor.submit(within_cluster_variance_calculator_thread, index,
                                        divided_indices[index][0], divided_indices[index][-1],
                                        cluster, self.proximity_matrix, mean_within_dist)
                        for index in range(num_processes)]
         
             # collect the results as the processes complete and keep the largest deviation
             for f in concurrent.futures.as_completed(results):
                 result = f.result()
                 if result > largest_diff:
                     largest_diff = result
             
         self.within_cluster_variance[cluster_index] = largest_diff
         print("Cluster {}'s within-cluster distance variance is {}".format(cluster_index, largest_diff))
         
     with open(self.CLUSTER_VARIANCE_FILE_NAME, 'wb') as f:
         config.PrintDebug("Saving Cluster Variance Into {}".format(self.CLUSTER_VARIANCE_FILE_NAME))
         np.save(f, self.within_cluster_variance)
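
# A minimal sketch of the module-level worker used above; its real body is not
# part of this section. It is assumed to scan its assigned slice of cluster
# members, compute each member's mean distance to the rest of the cluster, and
# return the largest absolute deviation from the cluster-wide mean distance.
def within_cluster_variance_calculator_thread(index, start_index, end_index, cluster, proximity_matrix, mean_within_dist):
    largest_diff = 0.0
    for i in range(start_index, end_index + 1):
        member_dists = [proximity_matrix[cluster[i]][cluster[k]] for k in range(len(cluster)) if k != i]
        if not member_dists:
            continue  # a single-element cluster has no pairwise distances
        diff = abs(np.nanmean(member_dists) - mean_within_dist)
        if diff > largest_diff:
            largest_diff = diff
    return largest_diff
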
 def _load_within_cluster_variance(self):
     with open(self.CLUSTER_VARIANCE_FILE_NAME, 'rb') as f:
         config.PrintDebug("Loading Cluster Variance From {}".format(self.CLUSTER_VARIANCE_FILE_NAME))
         self.within_cluster_variance = np.load(f, allow_pickle=True)
 def _load_cluster_groups(self):
     with open(self.CLUSTER_GROUPS_FILE_NAME, 'rb') as f:
         config.PrintDebug("Loading Cluster Groups From {}".format(self.CLUSTER_GROUPS_FILE_NAME))
         self.clusters = np.load(f, allow_pickle=True)
 def _merge_clusters(self, linkage_type):        
     self.merge_history = []
     
     proxy_matrix_dupe = np.array(self.proximity_matrix, copy=True) # create duplicate of the initial proximity matrix
     current_largest_cluster_index = proxy_matrix_dupe.shape[0]     # rolling cluster index, each new cluster gets a new index
     row_indices = np.arange(current_largest_cluster_index)         # helper array to help refer back to the original histogram
     
     # Create an array of arrays tracking which elements are in each cluster.
     # At the start, each cluster index only contains itself
     self.cluster_history = []
     for i in range(current_largest_cluster_index):
         self.cluster_history.append([i])
     
     while proxy_matrix_dupe.shape[0] > 1:
         start_time = time.time()
         print("Meging Next Closest Clusters. {} Clusters are remaining".format(proxy_matrix_dupe.shape[0]))
         
         # 1. Find the next closest pair of clusters
         smallest_index = self._find_smallest_index(proxy_matrix_dupe) # returns the row and column indices of the closest pair
         smallest_distance = proxy_matrix_dupe[smallest_index[0]][smallest_index[1]] # the actual distance between the pair
         smallest_original_index = [row_indices[smallest_index[0]], row_indices[smallest_index[1]]] # the original indices of the pair, referencing the original training data position
         
         # 2. Get which elements are in both clusters being merged
         cluster_one = self.cluster_history[smallest_original_index[0]]
         cluster_two = self.cluster_history[smallest_original_index[1]]
         both_clusters = cluster_one + cluster_two
         self.cluster_history.append(both_clusters) # keep track of all the elements in this new cluster
         
         # 3. delete the data belonging to the clusters being merged.
         proxy_matrix_dupe = np.delete(proxy_matrix_dupe, [smallest_index[0], smallest_index[1]], axis=0) # delete rows
         proxy_matrix_dupe = np.delete(proxy_matrix_dupe, [smallest_index[0], smallest_index[1]], axis=1) # delete cols
         # delete the row indices as well to make sure that the found index will continue referencing the original index
         row_indices = np.delete(row_indices, [smallest_index[0], smallest_index[1]])
         
         # 4. calculate the distances of this new cluster against all other clusters.
         # This has to happen after the deletion so we don't waste time calculating distances to the clusters that were just removed
         distances = self._calculate_subset_of_proximity_matrix(linkage_type, row_indices, both_clusters, self.cluster_history)
         
         # 5. Add the distances to the proximity matrix, both as column and as row
         proxy_matrix_dupe = np.vstack((proxy_matrix_dupe, distances)) # add new row to 2d matrix
         distances = np.append(distances, np.nan) # add an extra NaN to the end of the list, this will be for the distance to itself
         proxy_matrix_dupe = np.column_stack((proxy_matrix_dupe, distances)) # add new column to 2d matrix
         
         
         # 6. Keep tracking the index of this new cluster
         row_indices = np.append(row_indices, current_largest_cluster_index)
         current_largest_cluster_index += 1
         
         # 7. add result to merge_history
         # The merge history for each merge is an array with size 4:
         # Index 0 and 1 in this array references the indices of the clusters that got merged
         # Index 2 is the distance between the merged clusters
         # Index 3 is the size of the new cluster, how many training data instances it contains
         cluster_size = len(both_clusters)
         res = [smallest_original_index[0], smallest_original_index[1], smallest_distance, cluster_size]
         self.merge_history.append(res)
         
         config.PrintDebug("Merging {} with {} completed in {} seconds. Distance: {}, Cluster Size: {}".format(smallest_original_index[0], smallest_original_index[1], (time.time() - start_time), smallest_distance, cluster_size))
          
     self.merge_history = np.array(self.merge_history)
     with open(self.MERGE_HISTORY_FILE_NAME, 'wb') as f:
         config.PrintDebug("Saving Merge History Into {}".format(self.MERGE_HISTORY_FILE_NAME))
         np.save(f, self.merge_history)
     
     with open(self.MERGE_CLUSTER_HISTORY_FILE_NAME, 'wb') as f:
         config.PrintDebug("Saving Merge Cluster History Into {}".format(self.MERGE_CLUSTER_HISTORY_FILE_NAME))
         pickle.dump(self.cluster_history, f)
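
 # Minimal sketches of the two helpers used above; their real bodies are not
 # part of this section. _find_smallest_index is assumed to return the (row,
 # col) position of the smallest pairwise distance, ignoring NaN entries.
 def _find_smallest_index(self, matrix):
     return np.unravel_index(np.nanargmin(matrix), matrix.shape)

 # _calculate_subset_of_proximity_matrix is assumed to compute the distance from
 # the newly merged cluster to every remaining cluster; only average linkage
 # (mean of all member-to-member distances from the original proximity matrix)
 # is sketched here.
 def _calculate_subset_of_proximity_matrix(self, linkage_type, row_indices, new_cluster, cluster_history):
     distances = []
     for r in row_indices:
         other_cluster = cluster_history[r]
         pair_dists = [self.proximity_matrix[i][j] for i in new_cluster for j in other_cluster]
         distances.append(np.nanmean(pair_dists))
     return np.array(distances)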
 def _load_initial_proximity_matrix(self):
     with open(self.INITIAL_PROXY_MATRIX_FILE_NAME, 'rb') as f:
         config.PrintDebug("Loading Initial Proximity Matrix From {}".format(self.INITIAL_PROXY_MATRIX_FILE_NAME))
         self.proximity_matrix = np.load(f)
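
 # Typical pipeline order implied by the methods above (a usage sketch; the
 # actual driver code is not part of this section):
 #   self._create_and_save_training_data_histograms(num_bins, image_paths_list)
 #   self._re_calculate_proximity_matrix()
 #   self._merge_clusters(LINKAGE.AVERAGE)
 #   self._create_clusters_from_training_data(num_clusters)
 #   self.calculate_within_cluster_variance()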