Beispiel #1
0
    def filter_candidate_pool(self, candidate_pool, char):
        """Keep only the candidates whose image is close enough to the anchor.

        The secondary distance metric named by self.secondary_distance_type is
        evaluated between the image registered for 'char' and the image
        registered for each candidate. Candidates whose distance is at or
        below self.secondary_filter_threshold are kept.

        Args:
            candidate_pool: Iterable of Char, the possible confusables.
            char: Char, single anchor character; it must have an entry in
                self._label_img_map.

        Returns:
            List of (candidate, distance) tuples, in the iteration order of
            candidate_pool, one for every candidate that passed the threshold.
            The anchor character itself is never included.

        Raises:
            ValueError: If self.secondary_distance_type is not among the
                metrics returned for self.img_format.
        """
        # Resolve the image-level metric selected for the secondary filter
        available = Distance(self.img_format).get_metrics()
        metric_names = available.keys()
        if self.secondary_distance_type not in metric_names:
            raise ValueError(
                "Expect secondary_distance_type to be one of {}.".format(
                    metric_names))
        metric = available[self.secondary_distance_type]

        # Score every candidate against the anchor image
        kept = []
        for other in candidate_pool:
            # A character is not a confusable of itself, so the anchor is
            # only scored against code points different from its own.
            if ord(char) != ord(other):
                score = calculate_from_path(metric,
                                            self._label_img_map[char],
                                            self._label_img_map[other])
                if score <= self.secondary_filter_threshold:
                    kept.append((other, score))

        return kept
Beispiel #2
0
    def get_candidate_pool_for_char(self, char):
        """Obtain the candidates for confusables for specified 'char'. Use
        the reduced representations in self._reps and select, for each
        representation, the self.n_candidates labels closest to 'char' under
        the primary distance metric.

        The anchor character itself is not excluded here; it takes part in the
        ranking like any other label.

        Args:
            char: Char, single character, must exist in self.labels.

        Returns:
            candidate_pool: Set of Char, the union of the selected labels over
                all representations.
            candidate_dis: Dict, mapping from "PCA" + str(embs.shape[1]) to
                the list of (distance, label) tuples selected for that
                representation, sorted in ascending order.

        Raises:
            ValueError: If 'char' is not in self.labels, or if
                self.primary_distance_type is not a known embedding metric.
        """
        # Get character index in labels and embeddings
        idx = self.labels.index(char)

        # Get primary distance metrics. The lookup does not depend on the
        # representation, so it is resolved once instead of once per loop.
        embedding_metrics = Distance(ImgFormat.EMBEDDINGS).get_metrics()
        if self.primary_distance_type not in embedding_metrics:
            raise ValueError(
                "Expect primary_distance_type to be one of {}.".format(
                    embedding_metrics.keys()))
        primary_dis = embedding_metrics[self.primary_distance_type]

        # A non-positive n_candidates selects nothing instead of indexing
        # into an empty list.
        n_keep = max(self.n_candidates, 0)

        # Get a pool of possible candidates for secondary filter
        candidate_pool = set()
        # Store the selected (distance, label) pairs per representation
        candidate_dis = dict()
        for embs in self._reps:
            # Get embedding anchor to compare with others
            emb_anchor = embs[idx]

            # Keep a sorted list of the n_keep closest labels seen so far.
            # Tuples are (distance, label) so that they sort by distance.
            top_n = []
            for label, emb in zip(self.labels, embs):
                dis = primary_dis(emb_anchor, emb)
                if len(top_n) < n_keep:
                    bisect.insort(top_n, (dis, label))
                elif n_keep and dis < top_n[-1][0]:
                    # Closer than the current worst candidate: insert it and
                    # drop exactly one entry (the new worst) so the list
                    # stays at n_keep elements.
                    bisect.insort(top_n, (dis, label))
                    top_n.pop()

            # Store all candidate distances
            candidate_dis["PCA" + str(embs.shape[1])] = top_n
            candidate_pool.update(entry[1] for entry in top_n)

        return candidate_pool, candidate_dis