def link_lists_old(self, search_max=200, editCost=20, offsetCost=1, offsetInertia=5):
    """Link each HTML word to a PDF word id, by exact match or minimum edit distance.

    For every entry of ``self.html_word_list``, a window of ``search_max``
    candidate positions in ``self.pdf_word_list`` is scanned around a running
    offset estimate.  An exact text match wins immediately; otherwise the
    candidate minimizing ``editdist * editCost + rank * offsetCost`` is
    linked.  The resulting mapping (html word id -> pdf word id) is stored
    in ``self.links``.

    Assumes both word lists hold ``(id, text)`` pairs — text is compared at
    index 1 and ids are linked at index 0 (TODO confirm against the list
    builders elsewhere in this class).

    :param search_max: number of candidate PDF positions examined per word.
    :param editCost: weight applied to a candidate's edit distance.
    :param offsetCost: penalty per rank step away from the predicted offset.
    :param offsetInertia: number of recent match positions kept for the
        running-median offset estimate.
    """
    DEBUG = False  # flip to True to collect per-word search diagnostics
    if DEBUG:
        offsetHist = []
        jHist = []
        editDistHist = 0
    # Initial guess of where the HTML word stream aligns inside the PDF stream.
    offset = self._calculate_offset(self.html_word_list, self.pdf_word_list,
                                    max((search_max // 10), 5), search_max)
    offsets = [offset] * offsetInertia
    # Candidate ranks fan out around the predicted offset: 0, +1, -1, +2, -2, ...
    searchOrder = np.array([(-1)**(i % 2) * (i // 2) for i in range(1, search_max + 1)])
    links = OrderedDict()
    for i, a in enumerate(self.html_word_list):
        j = 0
        # Clamp candidate indices into the valid PDF range.
        searchIndices = np.clip(offset + searchOrder, 0, len(self.pdf_word_list) - 1)
        jMax = len(searchIndices)
        matched = False
        # Search first for exact matches
        while not matched and j < jMax:
            b = self.pdf_word_list[searchIndices[j]]
            if a[1] == b[1]:
                links[a[0]] = b[0]
                matched = True
                # Re-center the offset estimate on the median of the last
                # offsetInertia match positions (the +1 points past the match).
                offsets[i % offsetInertia] = searchIndices[j] + 1
                offset = int(np.median(offsets))
                if DEBUG:
                    jHist.append(j)
                    offsetHist.append(offset)
            j += 1
        # If necessary, search for min edit distance
        if not matched:
            cost = [0] * search_max
            for k, m in enumerate(searchIndices):
                # Edit distance, penalized by how far the candidate sits
                # from the predicted offset in the search order.
                cost[k] = (
                    editdist(a[1], self.pdf_word_list[m][1]) * editCost +
                    k * offsetCost)
            nearest = np.argmin(cost)
            links[a[0]] = self.pdf_word_list[searchIndices[nearest]][0]
            if DEBUG:
                jHist.append(nearest)
                offsetHist.append(searchIndices[nearest])
                editDistHist += 1
    if DEBUG:
        self.logger.debug(offsetHist)
        self.logger.debug(jHist)
        self.logger.debug(editDistHist)
        self.offsetHist = offsetHist
    self.links = links
    if self.verbose:
        self.logger.debug(
            "Linked {:d} words to {:d} bounding boxes".format(
                len(self.html_word_list), len(self.pdf_word_list)))
def link_fuzzy(i):
    """Link HTML word ``i`` to a PDF word index in ``html_to_pdf``.

    The target position is interpolated between the nearest anchors
    around ``i``; within the search window a prefix/suffix containment
    match wins outright, otherwise the candidate with the lowest
    weighted edit-distance cost is chosen.
    """
    _, word = self.html_word_list[i]
    lo = hi = i
    lo, hi, L, U = get_anchors(lo, hi)
    # Linearly interpolate i's expected PDF position between the anchors.
    predicted = int(L + float(i - lo) / (hi - lo) * (U - L))
    candidates = np.clip(predicted + search_order, 0, M - 1)
    penalties = [0] * search_max
    for rank, cand in enumerate(candidates):
        other = self.pdf_word_list[cand][1]
        contained = (word.startswith(other) or word.endswith(other)
                     or other.startswith(word) or other.endswith(word))
        if contained:
            # Prefix/suffix containment counts as a direct hit.
            html_to_pdf[i] = cand
            return
        penalties[rank] = int(editdist(word, other)) * edit_cost + rank * offset_cost
    # No containment match: fall back to the cheapest fuzzy candidate.
    html_to_pdf[i] = candidates[np.argmin(penalties)]
def get_lattice_similarity(lattice1: List[List[str]],
                           lattice2: List[List[str]],
                           threshold: float = 0.8,
                           ignore_stress: bool = False) -> float:
    """
    Compare two lattices to find the similarity ratio of the closest phonetic
    renderings of them.

    "threshold" is the similarity we're trying to match to return a potential
    link; higher values help us avoid expensive computation for the actual
    similarity score.

    The range of the output is [0, 1], 0 being the least similar, and 1
    indicating an identical phonetic rendering in the two lattices.  The
    ratio follows difflib.SequenceMatcher: where T is the total number of
    elements in both sequences, and M is the number of matches, it is
    2.0*M / T.
    See also:
    https://docs.python.org/3.7/library/difflib.html#difflib.SequenceMatcher.ratio

    :param lattice1: first lattice (list of phoneme-alternative lists).
    :param lattice2: second lattice.
    :param threshold: minimum bag-of-phonemes overlap required before the
        exact (expensive) edit-distance ratio is computed.
    :param ignore_stress: strip stress markers before comparing.
    :return: best similarity ratio over all rendering pairs, in [0, 1].
    """
    max_ratio = 0
    for p1 in lattice_to_phonemes(lattice1):  # each rendering from lattice1
        if ignore_stress:
            p1 = remove_stress(p1)
        # Split once and reuse (previously re-split three times per rendering).
        tokens1 = p1.split()
        c1 = Counter(tokens1)
        l1 = len(tokens1)
        for p2 in lattice_to_phonemes(lattice2):  # each rendering from lattice2
            if ignore_stress:
                p2 = remove_stress(p2)
            tokens2 = p2.split()
            c2 = Counter(tokens2)
            l2 = len(tokens2)
            if l1 == 0 and l2 == 0:
                # Two empty renderings are trivially identical (and would
                # otherwise divide by zero below).
                return 1
            # Orderless upper bound on the SequenceMatcher ratio (2*M/T with
            # matches counted as a multiset intersection — NOT Jaccard): if
            # even ignoring order the pair cannot reach `threshold`, skip the
            # more expensive edit-distance computation.
            if sum((c1 & c2).values()) * 2 / (l1 + l2) < threshold:
                continue
            # Edit-distance-based ratio (equivalent intent to
            # SequenceMatcher(a=tokens1, b=tokens2, autojunk=False).ratio()).
            ratio = 1 - editdist(tokens1, tokens2) / max(l1, l2)
            if ratio == 1:
                return 1
            if ratio > max_ratio:
                max_ratio = ratio
    return max_ratio
def _masked_edit_dist(src, tsf, pivots, id2word, output_path):
    """Compute edit distances between pivot-masked source/transfer sentences.

    Every word id in ``pivots[0] | pivots[1]`` is replaced by 0 (the
    ``_PAD`` id) in both sentences before the id sequences are joined into
    strings and compared with ``editdist``.  Each formatted sentence pair is
    also written to ``output_path`` for inspection.

    :param src: source sentences, each a sequence of word ids.
    :param tsf: transferred sentences, zipped pairwise with ``src``.
    :param pivots: pair of word-id sets whose union is masked out.
    :param id2word: id -> word mapping, used only for the log file.
    :param output_path: path of the text file the sentence pairs go to.
    :return: tuple ``(avg_dist, distances, dist_distribution)`` — the mean
        distance, a Counter of distances, and a length-8 histogram where
        buckets 0..6 hold exact-distance frequencies (normalized by
        ``len(src)``) and bucket 7 holds the remaining mass (>= 7).
    """
    # Loop-invariant: build the mask set once, not per sentence pair.
    pivot_set = pivots[0] | pivots[1]
    distances = []
    print('output write to:\n%s' % output_path)
    # `with` guarantees the log file is closed (it was previously leaked).
    with open(output_path, 'w') as fd:
        for s, t in zip(src, tsf):
            s_masked = [w if w not in pivot_set else 0 for w in s]  # 0 = '_PAD'
            t_masked = [w if w not in pivot_set else 0 for w in t]  # 0 = '_PAD'
            s_masked_ = ' '.join([str(w) for w in s_masked])
            t_masked_ = ' '.join([str(w) for w in t_masked])
            # NOTE(review): editdist receives the space-joined id strings, so
            # multi-digit ids are compared character-wise unless editdist
            # tokenizes internally — confirm intended.
            ed = editdist(s_masked_, t_masked_)
            distances.append(ed)
            fd.write('s: %s\n' % _format_sentence(s, id2word, pivots))
            fd.write('t: %s\n' % _format_sentence(t, id2word, pivots))
    avg_dist = np.average(distances)
    distances = Counter(distances)
    dist_distribution = np.zeros(8)
    for i in range(8):
        if i < 7:
            dist_distribution[i] = float(distances[i]) / len(src)
        else:
            # Last bucket absorbs everything not counted so far.
            dist_distribution[i] = 1 - dist_distribution[:i].sum()
    return avg_dist, distances, dist_distribution