def filter_candidate_pool(self, candidate_pool, char):
    """Keep the candidates whose secondary distance to 'char' is small enough.

    The secondary distance is evaluated between the image of the anchor
    character and the image of each candidate character, and compared
    against self.secondary_filter_threshold.

    Args:
        candidate_pool: Set of Char, the set of possible confusables.
        char: Char, single character whose corresponding image must exist
            in self.img_dir.

    Returns:
        List of (candidate, distance) tuples, one for every candidate other
        than 'char' itself whose distance is less than or equal to
        self.secondary_filter_threshold.

    Raises:
        ValueError: if self.secondary_distance_type is not one of the
            metrics offered for self.img_format.
    """
    # Resolve the secondary metric by its configured name.
    metrics_by_name = Distance(self.img_format).get_metrics()
    known_names = metrics_by_name.keys()
    if self.secondary_distance_type not in known_names:
        raise ValueError(
            "Expect secondary_distance_type to be one of {}.".format(
                known_names))
    metric = metrics_by_name[self.secondary_distance_type]

    accepted = []
    for other in candidate_pool:
        # The anchor character is never a confusable of itself.
        if ord(char) != ord(other):
            # presumably _label_img_map maps a label to its image path;
            # verify against where the map is built.
            distance = calculate_from_path(metric,
                                           self._label_img_map[char],
                                           self._label_img_map[other])
            if distance <= self.secondary_filter_threshold:
                accepted.append((other, distance))
    return accepted
def get_candidate_pool_for_char(self, char):
    """Obtain the confusable candidates for the specified 'char'.

    For every reduced representation in self._reps, the primary distance
    is computed from the anchor embedding (the row of 'char') to every
    embedding, and the self.n_candidates labels with the smallest distances
    are kept. The anchor itself is not excluded at this stage.

    Args:
        char: Char, single character, must exist in self.labels.

    Returns:
        candidate_pool: Set of Char, the union of the kept labels over all
            representations.
        candidate_dis: Dict, mapping "PCA" + str(embs.shape[1]) to the list
            of (distance, label) tuples kept for that representation,
            sorted in ascending order.

    Raises:
        ValueError: if self.primary_distance_type is not one of the
            available embedding metrics, or (raised by self.labels.index)
            if 'char' is not in self.labels.
    """
    # Get character index in labels and embeddings
    idx = self.labels.index(char)

    # The metric does not depend on the representation being processed, so
    # it is resolved and validated once instead of once per representation.
    embedding_metrics = Distance(ImgFormat.EMBEDDINGS).get_metrics()
    if self.primary_distance_type not in embedding_metrics.keys():
        raise ValueError(
            "Expect primary_distance_type to be one of {}.".format(
                embedding_metrics.keys()))
    primary_dis = embedding_metrics[self.primary_distance_type]

    # Pool of possible candidates for the secondary filter
    candidate_pool = set()
    # Distances between the kept candidates and the anchor, per
    # representation
    candidate_dis = dict()

    for embs in self._reps:
        # Embedding of the anchor, compared against every embedding
        emb_anchor = embs[idx]

        # Sorted list of the best (distance, label) pairs seen so far;
        # the tuple is reversed so that ordering is by distance first.
        top_n = []
        for label, emb in zip(self.labels, embs):
            dis = primary_dis(emb_anchor, emb)
            if len(top_n) < self.n_candidates:
                bisect.insort(top_n, (dis, label))
            elif top_n and dis < top_n[-1][0]:
                # Closer than the current worst: insert, then drop the
                # worst so that exactly n_candidates entries remain.
                # Truncating to n_candidates - 1 would leave a free slot
                # that the next label fills regardless of its distance.
                bisect.insort(top_n, (dis, label))
                top_n = top_n[:self.n_candidates]

        # Store all candidate distances
        candidate_dis["PCA" + str(embs.shape[1])] = top_n
        candidate_pool.update(label for _, label in top_n)

    return candidate_pool, candidate_dis